def column_expr(dataset, column_name, query: Query, parsing_context: ParsingContext, alias=None, aggregate=None):
    """
    Expand special column names into more complex expressions and return the
    expanded, aliased expression.

    Needs the body of the request (via dataset/query/parsing context) for some
    extra data used to expand column expressions.
    """
    assert column_name or aggregate
    assert not aggregate or (aggregate and (column_name or alias))
    column_name = column_name or ''

    # Structured expression: hand the whole thing to the function parser.
    if is_function(column_name, 0):
        return complex_column_expr(dataset, column_name, query, parsing_context)

    # Aggregate over a list of arguments: rebuild a [name, args, alias] triple.
    if isinstance(column_name, (list, tuple)) and aggregate:
        return complex_column_expr(dataset, [aggregate, column_name, alias], query, parsing_context)

    # Quoted literal: strip the quotes and escape the content.
    if isinstance(column_name, str) and QUOTED_LITERAL_RE.match(column_name):
        return escape_literal(column_name[1:-1])

    # Plain column, optionally wrapped in the aggregate function.
    expanded = dataset.column_expr(column_name, query, parsing_context)
    if aggregate:
        expanded = function_expr(aggregate, expanded)
    sanitized_alias = escape_alias(alias or column_name)
    return alias_expr(expanded, sanitized_alias, parsing_context)
def column_expr(
    dataset,
    column_name,
    query: Query,
    parsing_context: ParsingContext,
    alias=None,
    aggregate=None,
):
    """
    Expand special column names into more complex expressions and return the
    expanded, aliased expression.

    Needs the body of the request (via dataset/query/parsing context) for some
    extra data used to expand column expressions.
    """
    assert column_name or aggregate
    assert not aggregate or (aggregate and (column_name or alias))
    column_name = column_name or ""

    # Structured expression: hand the whole thing to the function parser.
    if is_function(column_name, 0):
        return complex_column_expr(dataset, column_name, query, parsing_context)

    # Aggregate over a list of arguments: rebuild a [name, args, alias] triple.
    if isinstance(column_name, (list, tuple)) and aggregate:
        return complex_column_expr(
            dataset, [aggregate, column_name, alias], query, parsing_context
        )

    # Quoted literal: strip the quotes and escape the content.
    if isinstance(column_name, str) and QUOTED_LITERAL_RE.match(column_name):
        return escape_literal(column_name[1:-1])

    expanded = dataset.column_expr(column_name, query, parsing_context)
    if aggregate:
        expanded = function_expr(aggregate, expanded)

    # In the ORDER BY clause, column_expr may receive column names prefixed
    # with `-` (meaning ORDER BY ... DESC). The `-` must stay outside the
    # aliased expression when we build (COL AS alias), otherwise the result is
    # invalid syntax. Worse, escape_alias already keeps `-` outside the
    # escaped expression, so without stripping we would end up with:
    #
    #   -events.event_id becomes (-events.event_id AS -`events.event_id`)
    #
    # So we strip the `-` before escaping/aliasing and re-attach it to the
    # final expression, producing
    #   -(events.event_id AS `events.event_id`)
    # or
    #   -`events.event_id`
    # if the alias already existed.
    #
    # The proper solution would be to strip the `-` before getting to column
    # processing; that will be done with the new column abstraction.
    sign, bare_col = NEGATE_RE.match(column_name).groups()
    alias = escape_alias(alias or bare_col)
    expr_sign, expanded = NEGATE_RE.match(expanded).groups()
    # sign and expr_sign should never be inconsistent with each other. This
    # will be verified before moving the `-` stripping to the beginning of the
    # method to cover tags as well.
    return f"{sign or expr_sign}{alias_expr(expanded, alias, parsing_context)}"
def parse_expression(val: Any, dataset_columns: ColumnSet, arrayjoin: Set[str]) -> Expression:
    """
    Parse a simple or structured expression encoded in the Snuba query language
    into an AST Expression.

    Structured (function) expressions are delegated to the function parser;
    plain strings go through the string parser; anything else is rejected.
    """
    if is_function(val, 0):
        return parse_function_to_expr(val, dataset_columns, arrayjoin)
    if not isinstance(val, str):
        raise ParsingException(
            f"Expression to parse can only be a function or a string: {val}")
    return parse_string_to_expr(val)
def complex_column_expr(dataset, expr, query: Query, parsing_context: ParsingContext, depth=0):
    """
    Translate a structured function expression [name, args, alias] into its
    string form, recursing into nested function arguments.
    """
    function_tuple = is_function(expr, depth)
    if function_tuple is None:
        raise ValueError(
            'complex_column_expr was given an expr %s that is not a function at depth %d.' % (expr, depth))

    name, args, alias = function_tuple
    translated = []
    position = 0
    while position < len(args):
        # Two consecutive arguments may together encode a nested function
        # ([name, args] pair); consume both when they do.
        pair = args[position:position + 2]
        if is_function(pair, depth + 1):
            translated.append(
                complex_column_expr(dataset, pair, query, parsing_context, depth + 1))
            position += 2
            continue
        argument = args[position]
        position += 1
        if is_function(argument, depth + 1):
            # Embedded function already in canonical form.
            translated.append(
                complex_column_expr(dataset, argument, query, parsing_context, depth + 1))
        elif isinstance(argument, str):
            translated.append(column_expr(dataset, argument, query, parsing_context))
        else:
            translated.append(escape_literal(argument))

    result = function_expr(name, ', '.join(translated))
    if alias:
        result = alias_expr(result, alias, parsing_context)
    return result
def parse_expression(val: Any) -> Expression:
    """
    Parse a simple expression into an AST Expression, distinguishing numeric
    and quoted literals from column references.
    """
    if is_function(val, 0):
        return parse_function_to_expr(val)
    # TODO: This will use the schema of the dataset to decide
    # if the expression is a column or a literal.
    if val.isdigit():
        return Literal(None, int(val))
    try:
        return Literal(None, float(val))
    except Exception:
        # Not a number: fall through to literal/column detection.
        pass
    if QUOTED_LITERAL_RE.match(val):
        return Literal(None, val[1:-1])
    return Column(None, val, None)
def __is_optimizable(self, condition: Condition, column: str) -> Optional[OptimizableCondition]:
    """
    Recognize whether the condition can be optimized.

    Optimizable conditions are top level conditions (no nested OR) in the
    form `tag[t] = value` (or `!=`). Functions referencing the tags as
    parameters are not taken into account, except for ifNull, which is
    unpacked. Both EQ and NEQ conditions are optimized.
    """
    if not is_condition(condition):
        return None
    operator = condition[1]
    if operator not in [Operand.EQ.value, Operand.NEQ.value]:
        return None
    rhs = condition[2]
    if not isinstance(rhs, str):
        # We can only support literals for now.
        return None

    lhs = condition[0]
    # Unpack the ifNull function. This is just an optimization to make this
    # class more useful, since the product wraps tag access into ifNull very
    # often and it is a trivial function to unpack. We could extend it to
    # more functions later.
    fn_match = is_function(lhs, 0)
    if fn_match and fn_match[0] == "ifNull" and len(fn_match[1]) > 0:
        lhs = fn_match[1][0]
    if not isinstance(lhs, str):
        return None

    # Now we expect a condition shaped like ["tags[something]", "=", "a string"]
    tag_match = NESTED_COL_EXPR_RE.match(lhs)
    if not tag_match or tag_match[1] != self.__nested_col:
        return None
    # tag_match[0] is the full expression that matched the regex;
    # tag_match[2] is the key inside the nested column.
    return OptimizableCondition(
        nested_col_key=tag_match[2],
        operand=Operand.EQ if operator == "=" else Operand.NEQ,
        value=rhs,
    )
def _parse_query_impl(body: MutableMapping[str, Any], entity: Entity) -> Query:
    """
    Parse a legacy (dict based) Snuba query body into a Query object against
    the given entity.

    Builds, in order: the select clause (groupby + aggregations +
    selected_columns), the array join expression and the set of array-joined
    columns, the where/having conditions, and the order by clauses.

    Raises ParsingException on malformed aggregations, invalid arrayJoin
    usage, or malformed order by clauses.
    """

    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        # Translate each raw expression into a named SelectedExpression.
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(
                tuplify(raw_expression), entity.get_data_model(), set())
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str) else exp.alias,
                    expression=exp,
                ))
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        if not isinstance(aggregation, Sequence):
            raise ParsingException((
                f"Invalid aggregation structure {aggregation}. "
                "It must be a sequence containing expression, column and alias."
            ))
        aggregation_function = aggregation[0]
        # Column and alias may both be absent/falsy in the raw body.
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    entity.get_data_model(),
                    set(),
                ),
            ))

    groupby_clause = build_selected_expressions(
        to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause + aggregations +
        build_selected_expressions(body.get("selected_columns", [])))

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], entity.get_data_model(), {arrayjoin})
    else:
        array_join_expr = None

    # Collect additional array join columns from explicit arrayJoin(...)
    # calls in the select clause.
    for select_expr in select_clause:
        if isinstance(select_expr.expression, FunctionCall):
            if select_expr.expression.function_name == "arrayJoin":
                parameters = select_expr.expression.parameters
                if len(parameters) != 1:
                    raise ParsingException(
                        "arrayJoin(...) only accepts a single parameter.")
                if isinstance(parameters[0], Column):
                    array_join_cols.add(parameters[0].column_name)
                else:
                    # We only accepts columns or functions that do not
                    # reference columns. We could not say whether we are
                    # actually arrayjoining on the values of the column
                    # if it is nested in an arbitrary function. But
                    # functions of literals are fine.
                    for e in parameters[0]:
                        if isinstance(e, Column):
                            raise ParsingException(
                                "arrayJoin(...) cannot contain columns nested in functions."
                            )

    where_expr = parse_conditions_to_expr(
        body.get("conditions", []), entity, array_join_cols)
    having_expr = parse_conditions_to_expr(
        body.get("having", []), entity, array_join_cols)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            # A string order by must match `[-]column`; the optional `-`
            # prefix selects descending order.
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is a string, "
                    "it must respect the format `[-]column`"))
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            # For a function order by, the optional `-` prefix sits on the
            # function name (first element of the list).
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException((
                    f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                    "the function name must respect the format `[-]func_name`"
                ))
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (f"Invalid Order By clause {orderby}. The Clause was neither "
                 "a string nor a function call."))
        orderby_parsed = parse_expression(
            tuplify(orderby), entity.get_data_model(), set())
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
def parse_function(
    output_builder: Callable[[Optional[str], str, List[TExpression]], TExpression],
    simple_expression_builder: Callable[[str], TExpression],
    literal_builder: Callable[[
        Optional[Union[str, datetime, date, List[Any], Tuple[Any], numbers.Number]]
    ], TExpression,
    ],
    unpack_array_condition_builder: Callable[
        [TExpression, str, Any, Optional[str]], TExpression],
    dataset_columns: ColumnSet,
    arrayjoin_cols: Set[str],
    expr: Any,
    depth: int = 0,
) -> TExpression:
    """
    Parses a function expression in the Snuba syntax and produces the expected
    data structure to be used in the Query object.

    It relies on three functions:
    - output_builder, this puts alias, function name and parameters together
    - simple_expression_builder, processes one column given the string name
    - literal_builder, processes any individual type that represent a literal.

    The goal of having these three functions is to preserve the parsing
    algorithm but being able to either produce an AST or the old Clickhouse
    syntax.

    Raises ParsingException when `expr` is not a function at this depth.
    """
    function_tuple = is_function(expr, depth)
    if function_tuple is None:
        raise ParsingException(
            f"complex_column_expr was given an expr {expr} that is not a function at depth {depth}.",
            report=False,
        )

    name, args, alias = function_tuple

    # If the first argument is a simple column name that refers to an array column
    # (and we are not arrayJoining on that column, which would make it scalar again)
    # we assume that the user actually means to check if any (or all) items in the
    # array match the predicate, so we return an `any(x == value for x in array_column)`
    # type expression. We assume that operators looking for a specific value (IN, =, LIKE)
    # are looking for rows where any array value matches, and exclusionary operators
    # (NOT IN, NOT LIKE, !=) are looking for rows where all elements match (eg. all NOT LIKE 'foo').
    # This check will only work if the array column is a bare column in the condition. If the array
    # column is itself nested in further functions, this transform will not work.
    if name in FUNCTION_TO_OPERATOR:
        if len(args) == 2 and isinstance(args[0], str) and args[0] in dataset_columns:
            column = dataset_columns[args[0]]
            if isinstance(column.type.get_raw(), Array):
                if (column.flattened not in arrayjoin_cols
                        and column.base_name not in arrayjoin_cols):
                    return unpack_array_condition_builder(
                        simple_expression_builder(args[0]),
                        name,
                        args[1],
                        alias,
                    )

    out: List[TExpression] = []
    i = 0
    while i < len(args):
        # Two consecutive arguments may together encode a nested function
        # ([name, args] pair); consume both when they do.
        next_2 = args[i:i + 2]
        if is_function(next_2, depth + 1):
            out.append(
                parse_function(
                    output_builder,
                    simple_expression_builder,
                    literal_builder,
                    unpack_array_condition_builder,
                    dataset_columns,
                    arrayjoin_cols,
                    next_2,
                    depth + 1,
                ))
            i += 2
        else:
            nxt = args[i]
            if is_function(nxt, depth + 1):
                # Embedded function
                out.append(
                    parse_function(
                        output_builder,
                        simple_expression_builder,
                        literal_builder,
                        unpack_array_condition_builder,
                        dataset_columns,
                        arrayjoin_cols,
                        nxt,
                        depth + 1,
                    ))
            elif isinstance(nxt, str):
                out.append(simple_expression_builder(nxt))
            else:
                out.append(literal_builder(nxt))
            i += 1
    return output_builder(alias, name, out)
def _parse_query_impl(body: MutableMapping[str, Any], dataset: Dataset) -> Query:
    """
    Parse a legacy (dict based) Snuba query body into a Query object.

    Builds, in order: the aggregations, groupby and selected columns, the
    array join expression, the where/having conditions, and the order by
    clauses.

    Raises ValueError on malformed aggregations or order by clauses.
    """
    aggregate_exprs = []
    for aggregation in body.get("aggregations", []):
        # Validate with an explicit exception instead of `assert`: asserts
        # are stripped under `python -O`, which would turn a malformed query
        # into an obscure error below. This also matches the ValueError
        # already raised for invalid order by clauses further down.
        if not isinstance(aggregation, (list, tuple)):
            raise ValueError(f"Invalid aggregation structure {aggregation}")
        aggregation_function = aggregation[0]
        # Both column and alias may be missing/falsy in the raw body.
        column_expr = aggregation[1]
        column_expr = column_expr if column_expr else []
        alias = aggregation[2]
        alias = alias if alias else None
        aggregate_exprs.append(
            parse_aggregation(aggregation_function, column_expr, alias))

    groupby_exprs = [
        parse_expression(tuplify(group_by))
        for group_by in to_list(body.get("groupby", []))
    ]
    select_exprs = [
        parse_expression(tuplify(select))
        for select in body.get("selected_columns", [])
    ]
    # Groupby, aggregations and selected columns all become selected columns.
    selected_cols = groupby_exprs + aggregate_exprs + select_exprs

    arrayjoin = body.get("arrayjoin")
    if arrayjoin:
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"])
    else:
        array_join_expr = None

    where_expr = parse_conditions_to_expr(
        body.get("conditions", []), dataset, arrayjoin)
    having_expr = parse_conditions_to_expr(
        body.get("having", []), dataset, arrayjoin)

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        if isinstance(orderby, str):
            # A string order by must match `[-]column`; the optional `-`
            # prefix selects descending order.
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ValueError(f"Invalid Order By clause {orderby}")
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            # For a function order by, the optional `-` prefix sits on the
            # function name (first element of the list).
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ValueError(f"Invalid Order By clause {orderby}")
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ValueError(f"Invalid Order By clause {orderby}")
        orderby_parsed = parse_expression(tuplify(orderby))
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC
                if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            ))

    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    return Query(
        body,
        source,
        selected_columns=selected_cols,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=groupby_exprs,
        having=having_expr,
        order_by=orderby_exprs,
    )
def parse_function(
    output_builder: Callable[[Optional[str], str, List[TExpression]], TExpression],
    simple_expression_builder: Callable[[str], TExpression],
    literal_builder: Callable[[
        Optional[Union[str, datetime, date, List[Any], Tuple[Any], numbers.Number]]
    ], TExpression,
    ],
    expr: Any,
    depth: int = 0,
) -> TExpression:
    """
    Parses a function expression in the Snuba syntax and produces the expected
    data structure to be used in the Query object.

    It relies on three functions:
    - output_builder, this puts alias, function name and parameters together
    - simple_expression_builder, processes one column given the string name
    - literal_builder, processes any individual type that represent a literal.

    The goal of having these three functions is to preserve the parsing
    algorithm but being able to either produce an AST or the old Clickhouse
    syntax.
    """
    match = is_function(expr, depth)
    if match is None:
        raise ValueError(
            "complex_column_expr was given an expr %s that is not a function at depth %d." % (expr, depth))

    name, args, alias = match

    def recurse(sub_expr):
        # Recurse into a nested function with the same builders.
        return parse_function(
            output_builder,
            simple_expression_builder,
            literal_builder,
            sub_expr,
            depth + 1,
        )

    parsed: List[TExpression] = []
    index = 0
    while index < len(args):
        # Two consecutive arguments may together encode a nested function
        # ([name, args] pair); consume both when they do.
        pair = args[index:index + 2]
        if is_function(pair, depth + 1):
            parsed.append(recurse(pair))
            index += 2
            continue
        argument = args[index]
        index += 1
        if is_function(argument, depth + 1):
            # Embedded function already in canonical form.
            parsed.append(recurse(argument))
        elif isinstance(argument, str):
            parsed.append(simple_expression_builder(argument))
        else:
            parsed.append(literal_builder(argument))
    return output_builder(alias, name, parsed)