def test_subqueries_query(self):
    """Subqueries parse: aggregates over a derived table, with and without
    alias qualification of the outer columns."""
    subquery_sqls = [
        'SELECT SUM(subquery.Store), SUM(avg_price) FROM (SELECT Store, Temperature, AVG(table1.Fuel_Price) AS avg_price FROM features AS table1 GROUP BY Store, Temperature) AS subquery GROUP BY Temperature;',
        'SELECT SUM(Store), SUM(avg_price) FROM (SELECT Store, Temperature, AVG(table1.Fuel_Price) AS avg_price FROM features AS table1 GROUP BY Store, Temperature) AS subquery GROUP BY Temperature;',
        'SELECT SUM(avg_price) FROM (SELECT AVG(Fuel_Price) AS avg_price FROM features GROUP BY IsHoliday) AS subquery;',
    ]
    for sql in subquery_sqls:
        # parsing must succeed; the resulting AST is not inspected here
        QueryParser(schema).query(sql)
def test_cast_float(self):
    """A nested CAST/EXTRACT expression round-trips through the parser and
    evaluates to a float."""
    frag = "CAST(EXTRACT(WEEKDAY FROM CAST('2017-05-10 09:01:01' AS TIMESTAMP)) AS FLOAT)"
    expr = QueryParser().parse_expression(frag)
    # round-trip check, ignoring whitespace differences
    assert frag.replace(' ', '') == str(expr).replace(' ', '')
    value = expr.evaluate({})
    assert isinstance(value, float)
    assert value == 2.0
def test_check_thresholds_gauss(self):
    # check tau for various privacy parameters
    """Verify the gaussian-mechanism censoring threshold (tau) computed by
    PrivateReader against a closed-form value, across a grid of epsilon,
    max_contrib, and delta settings."""
    epsilons = [0.1, 2.0]
    max_contribs = [1, 3]
    deltas = [10E-5, 10E-15]
    query = "SELECT COUNT(*) FROM PUMS.PUMS GROUP BY married"
    reader = PandasReader(df, schema)
    qp = QueryParser(schema)
    q = qp.query(query)
    for eps in epsilons:
        for d in max_contribs:
            for delta in deltas:
                privacy = Privacy(epsilon=eps, delta=delta)
                # force the gaussian mechanism for the threshold statistic
                privacy.mechanisms.map[Stat.threshold] = Mechanism.gaussian
                # using slightly different formulations of same formula from different papers
                # make sure private_reader round-trips
                gaus_scale = math.sqrt(d) * math.sqrt(
                    2 * math.log(1.25 / delta)) / eps
                gaus_rho = 1 + gaus_scale * math.sqrt(
                    2 * math.log(d / math.sqrt(2 * math.pi * delta)))
                # shallow-copy the schema so max_ids can vary per iteration
                # without mutating the shared module-level schema
                schema_c = copy.copy(schema)
                schema_c["PUMS.PUMS"].max_ids = d
                private_reader = PrivateReader(reader, metadata=schema_c, privacy=privacy)
                assert (private_reader._options.max_contrib == d)
                r = private_reader._execute_ast(q)
                # loose tolerance: tau is derived from randomized internals
                assert (math.isclose(private_reader.tau, gaus_rho, rel_tol=0.03, abs_tol=2))
def test_join_query(self):
    """COUNT over a column referenced bare, via a table alias, and via the
    table name all parse against the schema."""
    for sql in (
        'SELECT COUNT(Store), COUNT(*) FROM sales',
        'SELECT COUNT(table1.Store), COUNT(*) FROM sales AS table1',
        'SELECT COUNT(sales.Store), COUNT(*) FROM sales',
    ):
        QueryParser(schema).query(sql)
def test_iif(self):
    """IIF returns its second argument when the condition holds, otherwise
    the third (which may be a numeric or string literal)."""
    parser = QueryParser()
    expr = parser.parse_expression("IIF(x <= 5, y, 0)")
    env = {'x': 5, 'y': 10, 'z': 12}
    assert expr.evaluate(env) == 10   # condition true -> y
    env["x"] = 6
    assert expr.evaluate(env) == 0    # condition false -> literal 0
    expr = parser.parse_expression("IIF(x <= 5, y, 'string')")
    assert expr.evaluate(env) == "string"
def test_rewriting():
    """Every sample query must parse and rewrite, and the rewritten SQL must
    round-trip: re-parsing str(dp_query) yields an equal AST.

    Fixes: replaced the bare ``except:`` with ``except Exception`` and chained
    the original error (``from e``) so the real parse/rewrite failure is not
    discarded; stopped rebinding the loop variable ``query`` so the failure
    message always shows the original query text.
    """
    for query in queries:
        try:
            parsed = QueryParser(metadata).query(str(query))
            dp_query = Rewriter(metadata).query(parsed)
        except Exception as e:
            raise ValueError(f"Query parse and rewrite failed: {query}") from e
        parsed_dp_query = QueryParser(metadata).query(str(dp_query))
        assert dp_query == parsed_dp_query
def test_ast_attach_nullable_true(self):
    """Nullable metadata from the schema is attached to parsed column symbols.

    Idiom fix: assert the nullable flag directly instead of comparing with
    ``== True``.
    """
    query = 'SELECT COUNT("IsHoliday") FROM sales'
    q = QueryParser(metadata).query(query)
    # first select symbol wraps the counted TableColumn
    assert q._select_symbols[0].expression.xpath_first('//TableColumn').nullable
    query = 'SELECT SUM(Store), "Date" as d FROM features GROUP BY "date"'
    q = QueryParser(metadata).query(query)
    assert q._named_symbols['d'].expression.nullable
def test_simple_case(self):
    """Simple CASE matches on the value of x, falling back to ELSE."""
    expr = QueryParser().parse_expression(
        "CASE x WHEN 5 THEN 'five' WHEN 6 THEN 'six' ELSE '' END")
    # x value -> expected branch result ('' for the ELSE branch)
    for x, expected in ((5, "five"), (6, 'six'), (7, '')):
        assert expr.evaluate({'x': x}) == expected
def test_variable_replace(self):
    """CASE branches may return bound variables, not just literals."""
    expr = QueryParser().parse_expression(
        "CASE x WHEN 5 THEN y WHEN 6 THEN z ELSE 0 END")
    env = {'x': 5, 'y': 10, 'z': 12}
    assert expr.evaluate(env) == 10   # WHEN 5 -> y
    env['x'] = 6
    assert expr.evaluate(env) == 12   # WHEN 6 -> z
    env['x'] = 1
    assert expr.evaluate(env) == 0    # ELSE
def test_ast_attach_sens(self):
    """Schema sensitivity propagates through SUM; COUNT is always 1."""
    parsed = QueryParser(metadata).query(
        'SELECT SUM("Temperature"), SUM(features."Store") AS store FROM features')
    assert parsed._select_symbols[0].expression.sensitivity() == 75
    assert parsed._named_symbols['store'].expression.sensitivity() == 150
    # COUNT (including DISTINCT) has sensitivity 1 regardless of column bounds
    parsed = QueryParser(metadata).query(
        'SELECT COUNT(DISTINCT "Temperature"), COUNT(features."Store") AS store FROM features')
    assert parsed._select_symbols[0].expression.sensitivity() == 1
    assert parsed._named_symbols['store'].expression.sensitivity() == 1
def test_full_case(self):
    """Searched CASE: each WHEN carries its own predicate over the bindings."""
    expr = QueryParser().parse_expression(
        "CASE WHEN x <= 5 THEN y WHEN x > 6 THEN 0 ELSE z END")
    env = {'x': 5, 'y': 10, 'z': 12}
    assert expr.evaluate(env) == 10   # x <= 5 -> y
    env['x'] = 6
    assert expr.evaluate(env) == 12   # neither predicate matches -> ELSE z
    env['x'] = 10
    assert expr.evaluate(env) == 0    # x > 6 -> 0
def test_string_bound(self):
    """CASE with string-valued bindings in every branch, including ELSE."""
    expr = QueryParser().parse_expression(
        "CASE x WHEN 5 THEN y WHEN 6 THEN z ELSE q END")
    env = {'x': 5, 'y': 'ten', 'z': 'twelve', 'q': 'zero'}
    assert expr.evaluate(env) == "ten"
    env['x'] = 6
    assert expr.evaluate(env) == "twelve"
    env['x'] = 1
    assert expr.evaluate(env) == "zero"
def runValidate(self):
    """Parse then validate each query; surface validation failures with the
    query text.

    Fix: replaced the bare ``except:`` with ``except Exception`` and chained
    the cause (``from e``) so the underlying validation error is preserved.
    """
    for qs in self.queries:
        q = QueryParser(metadata).query(qs)
        try:
            Validate().validateQuery(q, metadata)
        except Exception as e:
            raise ValueError(f"Validation failed for query: {str(q)}") from e
def runValidate(self):
    """Parse and validate each query; report the raw query text on failure.

    Bug fix: the except clause previously formatted ``q``, which is unbound
    when QueryParser itself raises, turning the intended ValueError into a
    NameError. Report the source string ``qs`` instead, and chain the cause.
    """
    for qs in self.queries:
        try:
            q = QueryParser(metadata).query(qs)
            Validate().validateQuery(q, metadata)
        except Exception as e:
            raise ValueError(
                f"Parse and validate failed for query: {qs}") from e
def test_execute_with_dpsu(self):
    """With use_dpsu enabled, a GROUP BY count query routes through a
    different (DPSU) reader.

    Consistency fix: construct PrivateReader with an explicit
    ``privacy=Privacy(epsilon=1.0)`` to match test_execute_without_dpsu,
    instead of the legacy bare positional epsilon.
    """
    schema_dpsu = copy.copy(schema)
    schema_dpsu["PUMS.PUMS"].use_dpsu = True
    reader = PandasReader(df, schema_dpsu)
    private_reader = PrivateReader(reader, schema_dpsu, privacy=Privacy(epsilon=1.0))
    assert private_reader._options.use_dpsu
    query = QueryParser(schema_dpsu).queries(
        "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
    # DPSU swaps in a different reader for this query shape
    assert private_reader._get_reader(query) is not private_reader.reader
def test_choose(self):
    """CHOOSE is 1-indexed; out-of-range indices yield NULL (None), string
    indices are coerced, and the index expression may be arithmetic.

    Idiom fix: compare against None with ``is`` rather than ``==``.
    """
    qp = QueryParser()
    c = qp.parse_expression("CHOOSE(x, 'a', 'b', 'c')")
    bindings = {'x': 3, 'y': 10, 'z': 12}
    assert c.evaluate(bindings) == "c"
    bindings["x"] = 1
    assert c.evaluate(bindings) == 'a'
    bindings["x"] = 0
    assert c.evaluate(bindings) is None   # below range -> NULL
    bindings["x"] = 10
    assert c.evaluate(bindings) is None   # above range -> NULL
    c = qp.parse_expression("CHOOSE(x, 'a', 5, NULL)")
    bindings = {'x': 3, 'y': 10, 'z': 12}
    assert c.evaluate(bindings) is None   # explicit NULL branch
    bindings["x"] = "2"
    assert c.evaluate(bindings) == 5      # string index is coerced
    c = qp.parse_expression("CHOOSE(x % 2 + 1, NULL, 5)")
    bindings["x"] = 13
    assert c.evaluate(bindings) == 5      # 13 % 2 + 1 == 2
def test_execute_without_dpsu(self):
    """With use_dpsu disabled, the private reader keeps its original reader."""
    schema_no_dpsu = copy.copy(schema)
    schema_no_dpsu["PUMS.PUMS"].use_dpsu = False
    pandas_reader = PandasReader(df, schema_no_dpsu)
    private_reader = PrivateReader(
        pandas_reader, schema_no_dpsu, privacy=Privacy(epsilon=1.0))
    assert private_reader._options.use_dpsu == False
    parsed = QueryParser(schema_no_dpsu).queries(
        "SELECT COUNT(*) AS c FROM PUMS.PUMS GROUP BY married")[0]
    # no DPSU: execution goes straight to the wrapped reader
    assert private_reader._get_reader(parsed) is private_reader.reader
def test_same_colname(self):
    """Identically named columns from two tables get table-qualified
    symbol names of the form "<table>_Store"."""
    parsed = QueryParser(metadata).query(
        'SELECT sales."Store", features."Store" FROM sales, features')
    for table in ('sales', 'features'):
        symbol = parsed._named_symbols[f'"{table}_Store"']
        assert symbol.expression.tablename == table
        assert symbol.expression.colname == 'Store'
def preprocess_df_from_query(schema, df, query_string):
    """Return a dataframe with the table's key column plus a "group_cols"
    column holding each row's GROUP BY key values.

    Fix: the query was previously parsed twice (once via ``.query`` and again
    via ``.queries``); it is now parsed once and the AST reused for both the
    grouping expressions and the source table lookup.
    """
    query_ast = QueryParser(schema).query(query_string)
    group_cols = [
        ge.expression.name for ge in query_ast.agg.groupingExpressions
    ]
    table_name = query_ast.source.find_node(Table).name
    # assumes the table declares at least one key column -- TODO confirm
    key_col = schema[table_name].key_cols()[0].name
    preprocessed_df = pd.DataFrame()
    preprocessed_df[key_col] = df[key_col]
    preprocessed_df["group_cols"] = tuple(df[group_cols].values.tolist())
    return preprocessed_df
def runBuild(self, exc):
    """Assert that parsing each query raises the expected exception type.

    Fix: the diagnostic message previously hard-coded "ValueError" even
    though the expected exception type is the ``exc`` parameter; it now
    names the actual expected type.
    """
    for query in self.queries:
        failed = False
        try:
            QueryParser().query(query)
        except exc:
            failed = True
        if not failed:
            print(
                "{0} should have thrown {1}, but succeeded".format(
                    query, exc.__name__))
        assert failed
def runBuild(self):
    """Each query must parse, round-trip to equivalent SQL (ignoring
    whitespace, newlines, and case), and re-parse.

    Fix: chained the caught exception (``from e``) so the original traceback
    is preserved in the reported ValueError.
    """
    for query in self.queries:
        try:
            q = QueryParser().query(query)
            self.walk_children(q)
            # whitespace/newline/case-insensitive round-trip check
            assert query.replace(' ', '').replace(
                '\n', '').lower() == str(q).replace(' ', '').replace('\n', '').lower()
            self.runParseAgain(q)
        except Exception as e:
            raise ValueError(f"Parse error for {str(query)}: {str(e)}") from e
def test_with_censor_dims(self):
    """Rewriting with dimension censoring appends a keycount expression
    rather than reusing the first COUNT column."""
    meta = Metadata.from_file(meta_path)
    pums = pd.read_csv(csv_path)
    private_reader = PrivateReader(
        PandasReader(pums, meta), meta, privacy=Privacy(epsilon=3.0))
    query = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS"
    parsed = QueryParser(meta).query(query)
    inner, outer = private_reader._rewrite_ast(parsed)
    named = outer.select.namedExpressions
    # foo stays a plain count; bar becomes the censoring keycount
    assert named[0].expression.expression.name != 'keycount'
    assert named[1].expression.expression.name == 'keycount'
def runRewrite(self):
    """Rewriting must preserve symbol resolution and the type of every
    select expression.

    Fix: the caught exception ``e`` was previously discarded; it is now
    chained (``from e``) so the underlying rewrite error is visible.
    """
    for q in QueryParser(metadata).queries(self.queryBatch):
        try:
            new_q = Rewriter(metadata).query(q)
            assert q.has_symbols()
            assert new_q.has_symbols()
            # pairwise type equality across original and rewritten symbols
            assert all(
                qt.expression.type() == nqt.expression.type()
                for qt, nqt in zip(q._select_symbols, new_q._select_symbols))
        except Exception as e:
            raise ValueError(f"Rewrite error for query: {str(q)}") from e
def test_viz_query_rewritten(self):
    """Both the inner and outer rewritten ASTs render to graphviz Digraphs."""
    parsed_query = QueryParser(schema).query(
        "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age")
    reader = PandasReader(df, schema)
    private_reader = PrivateReader(reader, schema, privacy=Privacy(epsilon=1.0))
    inner, outer = private_reader._rewrite_ast(parsed_query)
    # to eyeball a graph: graph.render('ast_digraph', view=True, cleanup=True)
    for ast in (outer, inner):
        graph = ast.visualize(n_trunc=30)
        assert isinstance(graph, Digraph)
def test_reuse_expression(self):
    """Rewriting AVG/SUM/COUNT over one column must share the sum and count
    subexpressions (only two distinct column names in the outer select).

    Fix: removed a dead ``q = QueryParser(meta).query(query)`` call whose
    result was never used -- ``_rewrite`` takes the raw query string.
    """
    meta = Metadata.from_file(meta_path)
    df = pd.read_csv(csv_path)
    reader = PandasReader(df, meta)
    private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=3.0))
    query = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS'
    inner, outer = private_reader._rewrite(query)
    names = unique(
        [f.name for f in outer.select.namedExpressions.find_nodes(Column)])
    assert len(names) == 2
    assert 'count_age' in names
    assert 'sum_age' in names
def test_empty_result_count_typed_notau_prepost(self):
    """COUNT over an empty result set (WHERE matches no rows) should still
    return a two-row result, repeatedly, with censor_dims disabled."""
    schema_all = copy.deepcopy(schema)
    schema_all['PUMS.PUMS'].censor_dims = False
    # NOTE(review): the reader is built with the original `schema` while the
    # private reader gets `schema_all` -- looks deliberate, but confirm.
    reader = PandasReader(df, schema)
    query = QueryParser(schema).queries(
        "SELECT COUNT(*) as c FROM PUMS.PUMS WHERE age > 100")[0]
    private_reader = PrivateReader(reader, schema_all, privacy=Privacy(epsilon=1.0))
    # warm-up execution before the repeated checks
    private_reader._execute_ast(query, True)
    for i in range(3):
        print(private_reader._options)
        trs = private_reader._execute_ast(query, True)
        print("empty query")
        print(trs)
        # header row plus a single (noisy) count row
        assert (len(trs) == 2)
from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser

# Locate the PUMS fixture files relative to the repository root.
git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))
meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)

# Module-level fixtures shared by the accuracy tests: rewrite one multi-
# aggregate GROUP BY query and build an Accuracy object from its ASTs.
query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)
privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
subquery, root = priv._rewrite(query)
acc = Accuracy(root, subquery, privacy)


class TestAccuracy:
    def test_count_accuracy(self):
        """Count error bounds: positive, bounded, and wider at higher confidence."""
        # 95% confidence error bound must land in a plausible band
        error = acc.count(alpha=0.05)
        assert (error < 7.53978 and error > 0.5)
        # the 99% interval is wider than the 95% one, but still bounded
        error_wide = acc.count(alpha=0.01)
        assert (error_wide < 9.909)
        assert (error_wide > error)
def test_viz_query(self):
    """A schema-free parsed query renders to a graphviz Digraph."""
    parsed = QueryParser().query(
        "SELECT SUM(age) AS my_sum FROM pums.pums GROUP BY age")
    graph = parsed.visualize(color_types={Query: 'red'}, n_trunc=30)
    assert isinstance(graph, Digraph)
def test_viz_query_symbols(self):
    """A schema-bound query renders to a Digraph with Table nodes colored."""
    parsed = QueryParser(schema).query(
        "SELECT SUM(age) AS my_sum FROM PUMS.PUMS GROUP BY age")
    graph = parsed.visualize(color_types={Table: 'red'}, n_trunc=5)
    assert isinstance(graph, Digraph)
def qp(query_string):
    """Parse a SQL string into a query AST using a fresh, schema-less parser."""
    parser = QueryParser()
    return parser.query(query_string)