def validate(self, exp: Expression, entity: Entity) -> None:
    """
    Run the validator registered for a function call, if there is one.

    Expressions that are not a FunctionCall are ignored. The validator is
    looked up by function name among the default validators and the ones
    provided by the entity; a warning is logged when both define a
    validator for the same function name.

    Raises InvalidExpressionException (chained to the original error) when
    the validator rejects the call with InvalidFunctionCall.
    """
    if not isinstance(exp, FunctionCall):
        return

    custom_validators = entity.get_function_call_validators()
    overlap = custom_validators.keys() & default_validators.keys()
    if overlap:
        logger.warning(
            "Dataset validators are overlapping with default ones. Entity: %s. Overlap %r",
            entity,
            overlap,
            exc_info=True,
        )

    # ChainMap searches its maps in order, so on a name clash the default
    # validator takes precedence over the entity one.
    lookup = ChainMap(default_validators, custom_validators)
    try:
        # TODO: Decide whether these validators should exist at the Dataset or Entity level
        checker = lookup.get(exp.function_name)
        if checker is None:
            return
        checker.validate(exp.parameters, entity.get_data_model())
    except InvalidFunctionCall as error:
        message = f"Illegal call to function {exp.function_name}: {error!s}"
        raise InvalidExpressionException(exp, message) from error
def _parse_query_impl(body: MutableMapping[str, Any], entity: Entity) -> Query:
    """
    Build a Query object from the raw query body.

    The following keys of `body` are read: "aggregations", "groupby",
    "selected_columns", "arrayjoin", "conditions", "having" and "orderby".
    The resulting select clause is the concatenation of the group by
    expressions, the aggregations and the selected columns, in that order.

    Raises ParsingException when an aggregation is not a (non string)
    sequence of at least three items, when arrayJoin(...) is used with an
    unsupported argument, or when an order by clause is malformed.
    """

    def build_selected_expressions(
        raw_expressions: Sequence[Any],
    ) -> List[SelectedExpression]:
        output = []
        for raw_expression in raw_expressions:
            exp = parse_expression(
                tuplify(raw_expression), entity.get_data_model(), set()
            )
            output.append(
                SelectedExpression(
                    # An expression in the query can be a string or a
                    # complex list with an alias. In the second case
                    # we trust the parser to find the alias.
                    name=raw_expression
                    if isinstance(raw_expression, str)
                    else exp.alias,
                    expression=exp,
                )
            )
        return output

    aggregations = []
    for aggregation in body.get("aggregations", []):
        # A string is a Sequence too, and a sequence with fewer than three
        # items cannot be indexed below: reject both with a parsing error
        # instead of letting an IndexError (or a character-wise parse of a
        # string) leak out.
        if (
            not isinstance(aggregation, Sequence)
            or isinstance(aggregation, (str, bytes))
            or len(aggregation) < 3
        ):
            raise ParsingException(
                (
                    f"Invalid aggregation structure {aggregation}. "
                    "It must be a sequence containing expression, column and alias."
                )
            )
        aggregation_function = aggregation[0]
        # Falsy column / alias values are normalized to [] / None.
        column_expr = aggregation[1] or []
        alias = aggregation[2] or None

        aggregations.append(
            SelectedExpression(
                name=alias,
                expression=parse_aggregation(
                    aggregation_function,
                    column_expr,
                    alias,
                    entity.get_data_model(),
                    set(),
                ),
            )
        )

    groupby_clause = build_selected_expressions(to_list(body.get("groupby", [])))

    select_clause = (
        groupby_clause
        + aggregations
        + build_selected_expressions(body.get("selected_columns", []))
    )

    array_join_cols = set()
    arrayjoin = body.get("arrayjoin")
    # TODO: Properly detect all array join columns in all clauses of the query.
    # This is missing an arrayJoin in condition with an alias that is then
    # used in the select.
    if arrayjoin:
        array_join_cols.add(arrayjoin)
        array_join_expr: Optional[Expression] = parse_expression(
            body["arrayjoin"], entity.get_data_model(), {arrayjoin}
        )
    else:
        array_join_expr = None
        # Without an explicit "arrayjoin" key, collect the columns that are
        # array joined through arrayJoin(...) calls in the select clause.
        for select_expr in select_clause:
            if isinstance(select_expr.expression, FunctionCall):
                if select_expr.expression.function_name == "arrayJoin":
                    parameters = select_expr.expression.parameters
                    if len(parameters) != 1:
                        raise ParsingException(
                            "arrayJoin(...) only accepts a single parameter."
                        )
                    if isinstance(parameters[0], Column):
                        array_join_cols.add(parameters[0].column_name)
                    else:
                        # We only accepts columns or functions that do not
                        # reference columns. We could not say whether we are
                        # actually arrayjoining on the values of the column
                        # if it is nested in an arbitrary function. But
                        # functions of literals are fine.
                        for e in parameters[0]:
                            if isinstance(e, Column):
                                raise ParsingException(
                                    "arrayJoin(...) cannot contain columns nested in functions."
                                )

    where_expr = parse_conditions_to_expr(
        body.get("conditions", []), entity, array_join_cols
    )
    having_expr = parse_conditions_to_expr(
        body.get("having", []), entity, array_join_cols
    )

    orderby_exprs = []
    for orderby in to_list(body.get("orderby", [])):
        # The direction is encoded as an optional "-" prefix on the column
        # name or on the function name; it is stripped before parsing.
        if isinstance(orderby, str):
            match = NEGATE_RE.match(orderby)
            if match is None:
                raise ParsingException(
                    (
                        f"Invalid Order By clause {orderby}. If the Order By is a string, "
                        "it must respect the format `[-]column`"
                    )
                )
            direction, col = match.groups()
            orderby = col
        elif is_function(orderby):
            match = NEGATE_RE.match(orderby[0])
            if match is None:
                raise ParsingException(
                    (
                        f"Invalid Order By clause {orderby}. If the Order By is an expression, "
                        "the function name must respect the format `[-]func_name`"
                    )
                )
            direction, col = match.groups()
            orderby = [col] + orderby[1:]
        else:
            raise ParsingException(
                (
                    f"Invalid Order By clause {orderby}. The Clause was neither "
                    "a string nor a function call."
                )
            )
        orderby_parsed = parse_expression(
            tuplify(orderby), entity.get_data_model(), set()
        )
        orderby_exprs.append(
            OrderBy(
                OrderByDirection.DESC if direction == "-" else OrderByDirection.ASC,
                orderby_parsed,
            )
        )

    return Query(
        body,
        None,
        selected_columns=select_clause,
        array_join=array_join_expr,
        condition=where_expr,
        groupby=[g.expression for g in groupby_clause],
        having=having_expr,
        order_by=orderby_exprs,
    )
def parse_conditions(
    operand_builder: Callable[[Any, ColumnSet, Set[str]], TExpression],
    and_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    or_builder: Callable[[Sequence[TExpression]], Optional[TExpression]],
    unpack_array_condition_builder: Callable[[TExpression, str, Any], TExpression],
    simple_condition_builder: Callable[[TExpression, str, Any], TExpression],
    entity: Entity,
    conditions: Any,
    arrayjoin_cols: Set[str],
    depth: int = 0,
) -> Optional[TExpression]:
    """
    Turn the raw conditions into a single boolean expression (or None when
    there are no conditions), suitable for the WHERE clause of the query.

    The top level list (depth 0) is combined with AND after removing
    duplicates while preserving order. Each of its items is either a simple
    `[lhs, op, literal]` condition or, at depth 1, a list of simple
    conditions combined with OR. Any deeper nesting is rejected with
    InvalidConditionException.

    operand_builder: Builds the TExpression for the left hand side of a
        simple condition, which can be as nested as the user wants.
    and_builder / or_builder: Combine a list of expressions in AND/OR.
    unpack_array_condition_builder: Builds the condition for the special
        case of a scalar comparison against an array column (see below).
    simple_condition_builder: Builds a simple condition from the left hand
        side expression, an operator and a literal on the right hand side.
    """
    from snuba.clickhouse.columns import Array

    def parse_nested(nested: Any) -> Optional[TExpression]:
        # Recurse one level deeper, forwarding all the builders unchanged.
        return parse_conditions(
            operand_builder,
            and_builder,
            or_builder,
            unpack_array_condition_builder,
            simple_condition_builder,
            entity,
            nested,
            arrayjoin_cols,
            depth + 1,
        )

    if not conditions:
        return None

    if depth == 0:
        # Using the parsed expressions as ordered dictionary keys removes
        # duplicates at the top level without changing their order.
        unique = OrderedDict.fromkeys(parse_nested(cond) for cond in conditions)
        return and_builder([parsed for parsed in unique if parsed])

    if is_condition(conditions):
        try:
            lhs, op, lit = conditions
        except Exception as cause:
            raise ParsingException(
                f"Cannot process condition {conditions}", cause
            ) from cause

        # Sorting the literals of IN conditions makes equivalent conditions
        # compare equal, which helps the dedupe above.
        if op in ("IN", "NOT IN") and isinstance(lit, tuple):
            lit = tuple(sorted(lit))

        # When the LHS is a plain column name referring to an array column
        # that is not being arrayJoined (which would make it scalar again)
        # and the RHS is a scalar, we assume the user wants to test the
        # items of the array, so an `any(x == value for x in array_column)`
        # style expression is produced. Operators looking for a value
        # (IN, =, LIKE) match rows where any item matches; exclusionary
        # operators (NOT IN, NOT LIKE, !=) match rows where all items match
        # (eg. all NOT LIKE 'foo').
        columns = entity.get_data_model()
        unpack_array = False
        if isinstance(lhs, str) and lhs in columns:
            column = columns[lhs]
            unpack_array = (
                isinstance(column.type, Array)
                and column.base_name not in arrayjoin_cols
                and column.flattened not in arrayjoin_cols
                and not isinstance(lit, (list, tuple))
            )

        condition_builder = (
            unpack_array_condition_builder
            if unpack_array
            else simple_condition_builder
        )
        return condition_builder(
            operand_builder(lhs, entity.get_data_model(), arrayjoin_cols), op, lit,
        )

    if depth == 1:
        alternatives = [parse_nested(cond) for cond in conditions]
        return or_builder([parsed for parsed in alternatives if parsed])

    raise InvalidConditionException(str(conditions))