def query(self, query):
    """Rewrite ``query`` as an outer select over an ``exact_aggregates`` subquery.

    The argument is rendered with ``str()``, parsed by
    ``QueryParser(self.metadata)`` and handed to ``Validate().validateQuery``.
    A fresh ``Scope`` then receives the grouping expressions (when the parsed
    query has an ``agg`` clause) and is passed to
    ``rewrite_outer_named_expression`` for every select item.  The scope's
    select list becomes the select of an inner query that keeps the parsed
    query's source, where, agg and having clauses; that inner query goes
    through ``exact_aggregates`` and is aliased ``"exact_aggregates"``.

    Args:
        query: the query to rewrite; only its string form is used.

    Returns:
        The result of parsing the string form of the outer query, which keeps
        the parsed query's agg, order and limit clauses.
    """
    parsed = QueryParser(self.metadata).query(str(query))
    Validate().validateQuery(parsed, self.metadata)

    inner_scope = Scope()

    # we make sure aggregates are in select scope for subqueries
    if parsed.agg is not None:
        for grouping in parsed.agg.groupingExpressions:
            inner_scope.push_name(grouping.expression)

    # The outer expressions are rewritten before the scope's select is read,
    # because the rewrite is given the scope and may add names to it.
    outer_items = []
    for named in parsed.select.namedExpressions:
        outer_items.append(
            self.rewrite_outer_named_expression(named, inner_scope)
        )
    outer_select = Select(None, Seq(outer_items))

    inner_query = Query(
        inner_scope.select(),
        parsed.source,
        parsed.where,
        parsed.agg,
        parsed.having,
        None,
        None,
    )
    inner_query = self.exact_aggregates(inner_query)
    relations = [AliasedRelation(inner_query, "exact_aggregates")]

    rewritten = Query(
        outer_select,
        From(relations),
        None,
        parsed.agg,
        None,
        parsed.order,
        parsed.limit,
    )
    # Round-trip through the parser so the caller gets a freshly parsed tree.
    return QueryParser(self.metadata).query(str(rewritten))
def exact_aggregates(self, query):
    """Wrap ``query`` in a select that adds a ``keycount`` column.

    The key count is ``COUNT(*)`` when ``self.options.row_privacy`` is set and
    ``COUNT(DISTINCT <key column>)`` otherwise.  The expressions of the
    incoming select list are pushed into a fresh ``Scope`` whose select list
    feeds an inner query (same source, where and agg as ``query``).  That
    inner query is wrapped by ``per_key_random`` (alias ``"per_key_random"``,
    filtered on ``row_num <= max_contrib``) when reservoir sampling is enabled
    without row privacy, and by ``per_key_clamped`` (alias ``"per_key_all"``)
    otherwise.

    Args:
        query: query object exposing ``select``, ``source``, ``where`` and
            ``agg``.

    Returns:
        A ``Query`` selecting ``keycount`` followed by the original named
        expressions, grouped by ``query.agg``.
    """
    inner_scope = Scope()

    if not self.options.row_privacy:
        key_col = self.key_col(query)
        keycount_expr = AggFunction("COUNT", "DISTINCT", Column(key_col))
        # NOTE(review): the flattened source does not show whether this push
        # belongs to the non-row-privacy branch only or runs for both
        # branches; it is kept inside this branch here -- confirm.
        inner_scope.push_name(keycount_expr.expression)
    else:
        keycount_expr = AggFunction("COUNT", None, AllColumns())

    for named in query.select.namedExpressions:
        inner_scope.push_name(named.expression)

    keycount = NamedExpression("keycount", keycount_expr)
    outer_select = Select(
        None, Seq([keycount, *query.select.namedExpressions])
    )

    inner_query = Query(
        inner_scope.select(),
        query.source,
        query.where,
        query.agg,
        None,
        None,
        None,
    )

    sample_per_key = (
        self.options.reservoir_sample and not self.options.row_privacy
    )
    if not sample_per_key:
        clamped = self.per_key_clamped(inner_query)
        relations = [AliasedRelation(clamped, "per_key_all")]
        return Query(
            outer_select, From(relations), None, query.agg, None, None, None
        )

    sampled = self.per_key_random(inner_query)
    relations = [AliasedRelation(sampled, "per_key_random")]
    # Keep only rows whose row number is within max_contrib.
    row_limit = Where(
        BooleanCompare(
            Column("per_key_random.row_num"),
            "<=",
            Literal(str(self.options.max_contrib), self.options.max_contrib),
        )
    )
    return Query(
        outer_select, From(relations), row_limit, query.agg, None, None, None
    )
def per_key_clamped(self, query):
    """Build the innermost query from the names ``clamp_expression`` registers.

    Every named expression of ``query.select`` is passed to
    ``clamp_expression`` together with the source relations, a fresh
    ``Scope`` and ``self.options.clamp_columns``.  The returned query selects
    whatever ended up in that scope and reuses the source and where clause of
    ``query``; it has no agg, having, order or limit.

    Args:
        query: query object exposing ``select``, ``source`` and ``where``.

    Returns:
        A ``Query`` over ``query.source`` with the scope's select list.
    """
    inner_scope = Scope()
    relations = query.source.relations

    clamped_items = []
    for named in query.select.namedExpressions:
        clamped_items.append(
            self.clamp_expression(
                named, relations, inner_scope, self.options.clamp_columns
            )
        )
    # NOTE(review): this Select is built but never attached to the returned
    # query; only the scope populated during the loop is used below.  The
    # construction is kept so behaviour is unchanged -- confirm it can go.
    unused_select = Select(None, Seq(clamped_items))

    return Query(
        inner_scope.select(),
        query.source,
        query.where,
        None,
        None,
        None,
        None,
    )
def per_key_random(self, query):
    """Select every column of the clamped subquery plus a per-key ``row_num``.

    ``row_num`` is a ``ROW_NUMBER`` ranking function whose over-clause is
    built from the key column and an ordering on ``RANDOM``.  The source is
    the result of ``per_key_clamped(query)``, aliased ``"clamped"`` or
    ``"not_clamped"`` depending on ``self.options.clamp_columns``.

    NOTE(review): another ``per_key_random`` definition follows this one in
    the file; if both live in the same class the later one replaces this
    one -- confirm which is intended.

    Args:
        query: query object passed to ``key_col`` and ``per_key_clamped``.

    Returns:
        A ``Query`` with only select and from clauses set.
    """
    key_col = self.key_col(query)

    all_columns = NamedExpression(None, AllColumns())
    random_order = Order([SortItem(BareFunction(FuncName("RANDOM")), None)])
    row_number = RankingFunction(
        FuncName("ROW_NUMBER"),
        OverClause(Column(key_col), random_order),
    )
    numbered = NamedExpression(Identifier("row_num"), row_number)
    outer_select = Select(None, [all_columns, numbered])

    clamped = self.per_key_clamped(query)
    alias = "clamped" if self.options.clamp_columns else "not_clamped"
    relations = [Relation(AliasedSubquery(clamped, Identifier(alias)), None)]

    return Query(outer_select, From(relations), None, None, None, None, None)
def per_key_random(self, query):
    """Select every column of the clamped subquery plus a per-key ``row_num``.

    ``row_num`` is a textual ``ROW_NUMBER() OVER (PARTITION BY <key> ORDER BY
    random())`` expression with the key column substituted in.  The source is
    the result of ``per_key_clamped(query)``, aliased ``"clamped"`` or
    ``"not_clamped"`` depending on ``self.options.clamp_columns``.

    NOTE(review): an earlier ``per_key_random`` definition precedes this one
    in the file; if both live in the same class this later one is the one
    that takes effect -- confirm which is intended.

    Args:
        query: query object passed to ``key_col`` and ``per_key_clamped``.

    Returns:
        A ``Query`` with only select and from clauses set.
    """
    key_col = self.key_col(query)

    row_num_template = "ROW_NUMBER() OVER (PARTITION BY {0} ORDER BY random())"
    all_columns = NamedExpression(None, AllColumns())
    numbered = NamedExpression(
        "row_num", Expression(row_num_template.format(key_col))
    )
    outer_select = Select(None, Seq([all_columns, numbered]))

    clamped = self.per_key_clamped(query)
    alias = "clamped" if self.options.clamp_columns else "not_clamped"
    relations = [AliasedRelation(clamped, alias)]

    return Query(outer_select, From(relations), None, None, None, None, None)