def run_agg_query(self, df, metadata, query, confidence, get_exact=True):
    """
    Run the query using the private reader and the input query.
    Returns the noisy responses together with the exact response.
    """
    reader = PandasReader(metadata, df)
    actual = 0.0
    # VAR is not supported in PandasReader, so the exact value is not fetched for every aggregation
    if get_exact:
        actual = reader.execute_typed(query).rows()[1:][0][0]

    private_reader = PrivateReader(metadata, reader, self.epsilon)
    query_ast = private_reader.parse_query_string(query)
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)

    noisy_values = []
    low_bounds = []
    high_bounds = []
    for idx in range(self.repeat_count):
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        res = private_reader._execute_ast(query_ast, True)
        # Disabled because confidence interval not available in report
        #interval = res.report[res.colnames[0]].intervals[confidence]
        #low_bounds.append(interval[0].low)
        #high_bounds.append(interval[0].high)
        noisy_values.append(res.rows()[1:][0][0])
    return np.array(noisy_values), actual, low_bounds, high_bounds
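# Illustrative usage sketch (not part of the original module): shows how run_agg_query
# might be invoked by an evaluator that already holds a pandas DataFrame and its
# Dataset Metadata. The names `evaluator`, `pums_df`, and `pums_metadata` are
# hypothetical placeholders; only the signature defined above is assumed.
#
#   noisy, actual, lows, highs = evaluator.run_agg_query(
#       pums_df,
#       pums_metadata,
#       "SELECT COUNT(age) FROM PUMS.PUMS",
#       confidence=0.95,
#       get_exact=True,
#   )
#   # `noisy` holds self.repeat_count noisy releases, `actual` the single exact value;
#   # the bound lists stay empty while confidence intervals remain disabled.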
def release(self, dataset: object, actual=False) -> Report:
    """
    Dataset is a collection of [Dataset Metadata, PandasReader].
    Releases the response to the SQL query the number of times requested
    by eval_params if actual is set to False.
    The actual (exact) response is only returned once.
    """
    if not actual:
        private_reader = PrivateReader(dataset[0], dataset[1], self.privacy_params.epsilon)
        query_ast = private_reader.parse_query_string(self.algorithm)
        srs_orig = private_reader.reader.execute_ast_typed(query_ast)
        noisy_values = []
        for idx in range(self.eval_params.repeat_count):
            srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
            res = private_reader._execute_ast(query_ast, True)
            noisy_values.append(res.rows()[1:][0][0])
        return Report({"__key__": noisy_values})
    else:
        reader = dataset[1]
        exact = reader.execute_typed(self.algorithm).rows()[1:][0][0]
        return Report({"__key__": exact})
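# Illustrative usage sketch (not part of the original module): `release` expects the
# dataset as a [metadata, PandasReader] pair and returns a Report keyed by "__key__".
# `algo`, `meta`, and `reader` are hypothetical placeholders for an instance of the
# enclosing algorithm class and its dataset objects.
#
#   noisy_report = algo.release([meta, reader])               # repeat_count noisy answers
#   exact_report = algo.release([meta, reader], actual=True)  # single exact answer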
def execute_typed(self, query):
    if not isinstance(query, str):
        raise ValueError(
            "Please pass a string to this function. You can use execute_ast to execute ASTs"
        )
    rows = self.execute(query)
    if len(rows) < 1:
        return None
    types = ["unknown" for i in range(len(rows[0]))]
    if len(rows) > 1:
        row = rows[1]
        for idx in range(len(row)):
            val = row[idx]
            # check bool before int, because bool is a subclass of int in Python
            if isinstance(val, bool):
                types[idx] = "boolean"
            elif isinstance(val, int):
                types[idx] = "int"
            elif isinstance(val, float):
                types[idx] = "float"
            else:
                types[idx] = "string"
    return TypedRowset(rows, types)
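# Illustrative usage sketch (not part of the original module): execute_typed infers
# column types from the first data row (rows()[0] holds the column names) and wraps
# the result in a TypedRowset. `reader` is assumed to be a PandasReader built from the
# same schema/DataFrame pair used in the surrounding code.
#
#   trs = reader.execute_typed("SELECT AVG(age) FROM PUMS.PUMS")
#   trs.types    # mapping from column name to inferred type, e.g. "float"
#   trs.rows()   # header row followed by data rows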
def test_empty_result_typed(self):
    reader = PandasReader(schema, df)
    rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
    trs = TypedRowset(rs, ['int'])
    assert len(trs) == 0
def run_agg_query_df(self, df, metadata, query, confidence, file_name="d1"):
    # Getting exact result
    reader = PandasReader(metadata, df)
    exact = reader.execute_typed(query).rows()[1:]
    exact_res = []
    for row in exact:
        exact_res.append(row)

    private_reader = PrivateReader(metadata, reader, self.epsilon)
    query_ast = private_reader.parse_query_string(query)

    # Distinguishing dimension and measure columns
    srs_orig = private_reader.reader.execute_ast_typed(query_ast)
    srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
    sample_res = private_reader._execute_ast(query_ast, True)
    headers = sample_res.colnames

    dim_cols = []
    num_cols = []
    for col in headers:
        if sample_res.types[col] == "string":
            dim_cols.append(col)
        else:
            num_cols.append(col)

    # Repeat the query and store the results along with intervals
    res = []
    for idx in range(self.repeat_count):
        dim_rows = []
        num_rows = []
        srs = TypedRowset(srs_orig.rows(), list(srs_orig.types.values()))
        singleres = private_reader._execute_ast(query_ast, True)
        for col in dim_cols:
            dim_rows.append(singleres[col])
        for col in num_cols:
            values = singleres[col]
            #low = singleres.report[col].intervals[confidence].low
            #high = singleres.report[col].intervals[confidence].high
            #num_rows.append(list(zip(values, low, high)))
            num_rows.append(list(zip(values)))
        res.extend(list(zip(*dim_rows, *num_rows)))

    exact_df = pd.DataFrame(exact_res, columns=headers)
    noisy_df = pd.DataFrame(res, columns=headers)

    # Add a dummy dimension column for merging D1 and D2 when no dimensions are available
    if len(dim_cols) == 0:
        dim_cols.append("__dim__")
    if dim_cols[0] == "__dim__":
        exact_df[dim_cols[0]] = ["key"] * len(exact_df)
        noisy_df[dim_cols[0]] = ["key"] * len(noisy_df)

    return noisy_df, exact_df, dim_cols, num_cols
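# Illustrative usage sketch (not part of the original module): for GROUP BY queries the
# helper returns aligned noisy/exact DataFrames plus the dimension and measure column
# names, which callers can use to join the two frames. `evaluator`, `pums_df`, and
# `pums_metadata` are hypothetical placeholders.
#
#   noisy_df, exact_df, dim_cols, num_cols = evaluator.run_agg_query_df(
#       pums_df,
#       pums_metadata,
#       "SELECT sex, COUNT(age) FROM PUMS.PUMS GROUP BY sex",
#       confidence=0.95,
#   )
#   merged = noisy_df.merge(exact_df, on=dim_cols, suffixes=("_noisy", "_exact"))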
def execute_ast_typed(self, query):
    syms = query.all_symbols()
    types = [s[1].type() for s in syms]
    rows = self.execute_ast(query)
    return TypedRowset(rows, types)
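# Illustrative usage sketch (not part of the original module): execute_ast_typed takes a
# parsed AST rather than a SQL string, so it pairs naturally with parse_query_string on a
# PrivateReader, as in the helpers above. `private_reader` is a hypothetical instance.
#
#   query_ast = private_reader.parse_query_string("SELECT COUNT(age) FROM PUMS.PUMS")
#   typed_rowset = private_reader.reader.execute_ast_typed(query_ast)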
def _execute_ast(self, query, cache_exact=False):
    if isinstance(query, str):
        raise ValueError("Please pass AST to _execute_ast.")

    subquery, query = self.rewrite_ast(query)
    max_contrib = self.options.max_contrib if self.options.max_contrib is not None else 1
    self.tau = max_contrib * (1 - (math.log(2 * self.delta / max_contrib) / self.epsilon))

    syms = subquery.all_symbols()
    source_col_names = [s[0] for s in syms]

    # list of sensitivities in column order
    sens = [s[1].sensitivity() for s in syms]

    # tell which are counts, in column order
    is_count = [s[1].is_count for s in syms]

    # set sensitivity to None if the column is a grouping key
    if subquery.agg is not None:
        group_keys = [
            ge.expression.name if hasattr(ge.expression, 'name') else None
            for ge in subquery.agg.groupingExpressions
        ]
    else:
        group_keys = []
    is_group_key = [colname in group_keys for colname in [s[0] for s in syms]]
    for idx in range(len(sens)):
        if is_group_key[idx]:
            sens[idx] = None

    kc_pos = None
    kcc_pos = []
    for idx in range(len(syms)):
        sname, sym = syms[idx]
        if sname == 'keycount':
            kc_pos = idx
        elif sym.is_key_count:
            kcc_pos.append(idx)
    if kc_pos is None and len(kcc_pos) > 0:
        kc_pos = kcc_pos.pop()

    # make a list of mechanisms in column order
    mechs = [
        Gaussian(self.epsilon, self.delta, s, max_contrib, self.interval_widths)
        if s is not None else None
        for s in sens
    ]

    # execute the subquery against the backend and load in tuples
    if cache_exact:
        # we only execute the exact query once
        if self._cached_exact is not None:
            if subquery == self._cached_ast:
                db_rs = self._cached_exact
            else:
                raise ValueError(
                    "Cannot run different query against cached result. "
                    "Make a new PrivateReader or else clear the cache with cache = False"
                )
        else:
            db_rs = self._get_reader(subquery).execute_ast(subquery)
            self._cached_exact = list(db_rs)
            self._cached_ast = subquery
    else:
        self._cached_exact = None
        self._cached_ast = None
        db_rs = self._get_reader(subquery).execute_ast(subquery)

    clamp_counts = self.options.clamp_counts

    def process_row(row_in):
        # pull out tuple values
        row = [v for v in row_in]
        # set null to 0 before adding noise
        for idx in range(len(row)):
            if sens[idx] is not None and row[idx] is None:
                row[idx] = 0.0
        # call all mechanisms to add noise
        out_row = [
            noise.release([v]).values[0] if noise is not None else v
            for noise, v in zip(mechs, row)
        ]
        # ensure all key counts are the same
        for idx in kcc_pos:
            out_row[idx] = out_row[kc_pos]
        # clamp counts to be non-negative
        if clamp_counts:
            for idx in range(len(row)):
                if is_count[idx] and out_row[idx] < 0:
                    out_row[idx] = 0
        return out_row

    if hasattr(db_rs, 'rdd'):
        # it's a dataframe
        out = db_rs.rdd.map(process_row)
    elif hasattr(db_rs, 'map'):
        # it's an RDD
        out = db_rs.map(process_row)
    else:
        out = map(process_row, db_rs[1:])

    if subquery.agg is not None and self.options.censor_dims:
        if hasattr(out, 'filter'):
            # it's an RDD
            tau = self.tau
            out = out.filter(lambda row: row[kc_pos] > tau)
        else:
            out = filter(lambda row: row[kc_pos] > self.tau, out)

    # get column information for outer query
    out_syms = query.all_symbols()
    out_types = [s[1].type() for s in out_syms]
    out_colnames = [s[0] for s in out_syms]

    def convert(val, type):
        if type == 'string' or type == 'unknown':
            return str(val).replace('"', '').replace("'", '')
        elif type == 'int':
            return int(float(str(val).replace('"', '').replace("'", '')))
        elif type == 'float':
            return float(str(val).replace('"', '').replace("'", ''))
        elif type == 'boolean':
            if isinstance(val, int):
                return val != 0
            else:
                return bool(str(val).replace('"', '').replace("'", ''))
        else:
            raise ValueError("Can't convert type " + type)

    def process_out_row(row):
        bindings = dict((name.lower(), val) for name, val in zip(source_col_names, row))
        row = [c.expression.evaluate(bindings) for c in query.select.namedExpressions]
        return [convert(val, type) for val, type in zip(row, out_types)]

    if hasattr(out, 'map'):
        # it's an RDD
        out = out.map(process_out_row)
    else:
        out = map(process_out_row, out)

    # sort it if necessary
    if query.order is not None:
        sort_fields = []
        for si in query.order.sortItems:
            if type(si.expression) is not ast.Column:
                raise ValueError("We only know how to sort by column names right now")
            colname = si.expression.name.lower()
            if colname not in out_colnames:
                raise ValueError(
                    "Can't sort by {0}, because it's not in output columns: {1}"
                    .format(colname, out_colnames)
                )
            colidx = out_colnames.index(colname)
            desc = False
            if si.order is not None and si.order.lower() == "desc":
                desc = True
            if desc and not (out_types[colidx] in ["int", "float", "boolean"]):
                raise ValueError("We don't know how to sort descending by " + out_types[colidx])
            sf = (desc, colidx)
            sort_fields.append(sf)

        def sort_func(row):
            # ascending by default; negate numeric values (or invert booleans) for descending
            return tuple([
                row[idx] if not desc
                else not row[idx] if out_types[idx] == "boolean"
                else -row[idx]
                for desc, idx in sort_fields
            ])

        if hasattr(out, 'sortBy'):
            out = out.sortBy(sort_func)
        else:
            out = sorted(out, key=sort_func)

    # output it
    if hasattr(out, 'toDF'):
        # Pipeline RDD
        return out.toDF(out_colnames)
    elif hasattr(out, 'map'):
        # Bare RDD
        return out
    else:
        return TypedRowset([out_colnames] + list(out), out_types)
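# Illustrative usage sketch (not part of the original module): the evaluation helpers
# above call _execute_ast directly with cache_exact=True so the exact subquery result is
# computed once and fresh noise is drawn on every repetition. `private_reader` and
# `query_ast` are hypothetical placeholders built the same way as in run_agg_query.
#
#   res = private_reader._execute_ast(query_ast, True)   # first call caches the exact rows
#   res2 = private_reader._execute_ast(query_ast, True)  # reuses the cache, new noise draw
#   res.rows()[1:]                                        # noisy data rows (header stripped)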
def test_make_empty(self):
    trs = TypedRowset(rows_1[0:1], types)
    assert len(trs) == 0
def test_make_1(self):
    trs = TypedRowset(rows_1, types)
    assert len(trs) == 1