def process_functions(exp: Expression) -> Expression:
    """
    Expand ``isHandled`` / ``notHandled`` calls into ``arrayExists`` calls.

    Both functions are rewritten to an ``arrayExists`` over the column named
    by ``self.__column``, with a lambda on ``x``:

    - ``isHandled``:  ``isNull(x) OR assumeNotNull(x) = 1``
    - ``notHandled``: ``isNotNull(x) AND assumeNotNull(x) = 0``

    The matching call is passed to ``self.validate_parameters`` before it is
    rewritten, and the rewritten call keeps the original alias. Any other
    expression is returned unchanged.
    """
    if not isinstance(exp, FunctionCall):
        return exp

    # function name -> (boolean combinator, null-check function, compared value)
    rewrites = {
        "isHandled": (BooleanFunctions.OR, "isNull", 1),
        "notHandled": (BooleanFunctions.AND, "isNotNull", 0),
    }
    spec = rewrites.get(exp.function_name)
    if spec is None:
        return exp

    combinator, null_check_name, compared_value = spec
    self.validate_parameters(exp)

    null_check = FunctionCall(None, null_check_name, (Argument(None, "x"),))
    value_check = binary_condition(
        ConditionFunctions.EQ,
        FunctionCall(None, "assumeNotNull", (Argument(None, "x"),)),
        Literal(None, compared_value),
    )
    predicate = Lambda(
        None,
        ("x",),
        binary_condition(combinator, null_check, value_check),
    )
    return FunctionCall(
        exp.alias,
        "arrayExists",
        (predicate, Column(None, None, self.__column)),
    )
def process_functions(exp: Expression) -> Expression:
    """
    Wrap or rename a handful of functions; leave everything else unchanged.

    - ``uniq`` and ``emptyIfNull`` calls are wrapped in ``ifNull`` with a
      fallback literal of ``0`` and ``""`` respectively.
    - ``log`` calls are wrapped in ``ifNotFinite`` with a fallback of ``0``.
    - A curried call whose internal function is ``top`` gets that internal
      function renamed to ``topK``.

    When wrapping, the original call (with its alias removed) becomes the
    first argument and the alias moves to the wrapping call.
    """
    # wrapped function name -> (wrapping function name, fallback literal value)
    wrappers = {
        "uniq": ("ifNull", 0),
        "emptyIfNull": ("ifNull", ""),
        "log": ("ifNotFinite", 0),
    }

    if isinstance(exp, FunctionCall):
        wrapper = wrappers.get(exp.function_name)
        if wrapper is not None:
            wrapping_name, fallback = wrapper
            return FunctionCall(
                exp.alias,
                wrapping_name,
                (replace(exp, alias=None), Literal(None, fallback)),
            )

    if isinstance(exp, CurriedFunctionCall):
        inner = exp.internal_function
        if inner.function_name == "top":
            return replace(
                exp,
                internal_function=replace(inner, function_name="topK"),
            )

    return exp
def _process_expressions(self, exp: Expression) -> Expression:
    """
    Rewrite a configured column into an ``arrayMap`` that strips dashes.

    A ``Column`` whose name is in ``self.columns`` becomes
    ``arrayMap(x -> replaceAll(toString(x), '-', ''), <column>)``. The new
    call carries the original alias; the inner column reference is built from
    the column name only. Every other expression is returned as is.
    """
    if not isinstance(exp, Column) or exp.column_name not in self.columns:
        return exp

    element_as_string = FunctionCall(None, "toString", (Argument(None, "x"),))
    strip_dashes = Lambda(
        None,
        ("x",),
        FunctionCall(
            None,
            "replaceAll",
            (element_as_string, Literal(None, "-"), Literal(None, "")),
        ),
    )
    return FunctionCall(
        exp.alias,
        "arrayMap",
        (strip_dashes, Column(None, None, exp.column_name)),
    )
def _process_expressions(self, exp: Expression) -> Expression:
    """
    Push ``arraySlice`` below ``arrayMap``.

    ``arraySlice(arrayMap(fn, arr), *slice_args)`` is rewritten to
    ``arrayMap(fn, arraySlice(arr, *slice_args))``; the outer alias is kept on
    the resulting ``arrayMap``. Expressions of any other shape are returned
    unchanged.
    """
    if not (isinstance(exp, FunctionCall) and exp.function_name == "arraySlice"):
        return exp

    mapped = exp.parameters[0]
    if not (isinstance(mapped, FunctionCall) and mapped.function_name == "arrayMap"):
        return exp

    mapper = mapped.parameters[0]
    source_array = mapped.parameters[1]
    # Everything after the first arraySlice parameter is passed through to
    # the relocated arraySlice call.
    sliced_source = FunctionCall(
        None,
        "arraySlice",
        (source_array,) + exp.parameters[1:],
    )
    return FunctionCall(exp.alias, "arrayMap", (mapper, sliced_source))
def parse(exp: Expression) -> Expression:
    """
    Rewrite an expression matched by ``ARRAY_JOIN_MATCH`` into a lambda call.

    On a match, the result is ``<function_name>(x -> assumeNotNull(
    <operator function>(x, <value>)), <column>)`` where the operator function
    is looked up in ``OPERATOR_TO_FUNCTION`` using the string form of the
    matched ``op`` literal. Expressions that do not match are returned as is.
    """
    matched = ARRAY_JOIN_MATCH.match(exp)
    if not matched:
        return exp

    function_name = matched.string("function_name")
    column = matched.expression("column")
    assert isinstance(column, Column)
    op_literal = matched.expression("op")
    assert isinstance(op_literal, Literal)
    op = str(op_literal.value)
    value = matched.expression("value")

    comparison = FunctionCall(
        None,
        OPERATOR_TO_FUNCTION[op],
        (Argument(None, "x"), value),
    )
    predicate = Lambda(
        None,
        ("x",),
        FunctionCall(None, "assumeNotNull", (comparison,)),
    )
    return FunctionCall(None, function_name, (predicate, column))
def with_required(condition: Optional[Expression] = None) -> Expression:
    """
    Build the fixed "required" condition, optionally AND-ed with another one.

    The required part is
    ``timestamp >= 2021-01-01 AND (timestamp < 2021-01-02 AND project_id = 1)``.
    When ``condition`` is given the result is ``condition AND required``,
    otherwise just the required part.
    """
    timestamp_from = FunctionCall(
        None,
        "greaterOrEquals",
        (
            Column("_snuba_timestamp", None, "timestamp"),
            Literal(None, datetime(2021, 1, 1, 0, 0)),
        ),
    )
    timestamp_to = FunctionCall(
        None,
        "less",
        (
            Column("_snuba_timestamp", None, "timestamp"),
            Literal(None, datetime(2021, 1, 2, 0, 0)),
        ),
    )
    project_filter = FunctionCall(
        None,
        "equals",
        (
            Column("_snuba_project_id", None, "project_id"),
            Literal(None, 1),
        ),
    )
    required = binary_condition(
        BooleanFunctions.AND,
        timestamp_from,
        binary_condition(BooleanFunctions.AND, timestamp_to, project_filter),
    )
    return (
        binary_condition(BooleanFunctions.AND, condition, required)
        if condition
        else required
    )
def test_expressions_from_basic_condition() -> None:
    """
    Iterating a basic condition ``f(t1.c1) = t1.c2`` yields the column, the
    function wrapping it, the right-hand column and finally the condition.
    """
    left_column = Column(None, "t1", "c1")
    left_function = FunctionCall(None, "f", (left_column,))
    right_column = Column(None, "t1", "c2")

    condition = binary_condition(
        ConditionFunctions.EQ, left_function, right_column
    )

    assert list(condition) == [
        left_column,
        left_function,
        right_column,
        condition,
    ]
def test_events_column_format_expressions() -> None:
    """
    ``EventsColumnProcessor`` wraps ``group_id`` in ``nullIf(group_id, 0)``
    and leaves ``culprit`` and ``message`` alone, both in the AST and in the
    formatted output of the last two selected columns.
    """

    def build_query(selected):
        # Each query gets its own TableSource / ColumnSet instance.
        return Query(
            {},
            TableSource("events", ColumnSet([])),
            selected_columns=selected,
        )

    unprocessed = build_query(
        [
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id", Column("the_group_id", None, "group_id")
            ),
            SelectedExpression(
                "the_message", Column("the_message", None, "message")
            ),
        ]
    )
    expected_query = build_query(
        [
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id",
                FunctionCall(
                    "the_group_id",
                    "nullIf",
                    (Column(None, None, "group_id"), Literal(None, 0)),
                ),
            ),
            SelectedExpression(
                "the_message", Column("the_message", None, "message")
            ),
        ]
    )

    EventsColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected_query.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    # Formatted form of every selected column except the first one.
    expected_formatted = (
        "(nullIf(group_id, 0) AS the_group_id)",
        "(message AS the_message)",
    )
    processed_tail = unprocessed.get_selected_columns_from_ast()[1:]
    for position, selected in enumerate(processed_tail):
        rendered = selected.expression.accept(ClickhouseExpressionFormatter())
        assert expected_formatted[position] == rendered
def attempt_map(
    self,
    expression: FunctionCall,
    children_translator: SnubaClickhouseStrictTranslator,
) -> Optional[FunctionCall]:
    """
    Map an aggregation call to ``self.to_name`` when this rule applies.

    Whether the rule applies is decided by ``_should_transform_aggregation``;
    if it does not, ``None`` is returned. Otherwise a new call named
    ``self.to_name`` is returned, keeping the original alias and using the
    parameters produced by ``_build_parameters``.
    """
    applies = _should_transform_aggregation(
        expression.function_name,
        self.from_name,
        self.column_to_map,
        expression,
    )
    if not applies:
        return None

    mapped_parameters = _build_parameters(
        expression, children_translator, self.aggr_col_name
    )
    return FunctionCall(expression.alias, self.to_name, mapped_parameters)
def test_handled_processor_invalid() -> None:
    """
    Processing ``isHandled(type)`` (a call that carries a parameter) must
    raise ``InvalidExpressionException``.
    """
    invalid_call = FunctionCall(
        "result", "isHandled", (Column(None, None, "type"),)
    )
    query = Query(
        QueryEntity(EntityKey.EVENTS, ColumnSet([])),
        selected_columns=[SelectedExpression("result", invalid_call)],
    )
    processor = handled_functions.HandledFunctionsProcessor(
        "exception_stacks.mechanism_handled", ColumnSet([])
    )

    with pytest.raises(InvalidExpressionException):
        processor.process_query(query, HTTPRequestSettings())
def preprocess_literal(op: str, literal: Any) -> Expression:
    """
    Turn a raw literal into an expression, validating it against ``op``.

    A list or tuple becomes a ``tuple(...)`` function call of literals and is
    only accepted with ``IN`` / ``NOT IN``. Any other value becomes a single
    ``Literal`` and is rejected with ``IN`` / ``NOT IN``.

    Raises:
        ParsingException: when the operator and the literal kind disagree.
    """
    is_sequence = isinstance(literal, (list, tuple))
    is_membership_op = op in ("IN", "NOT IN")

    if is_sequence and not is_membership_op:
        raise ParsingException(
            f"Invalid operator {op} for literal {literal}. Literal is a sequence. "
            "Operator must be IN/NOT IN"
        )
    if not is_sequence and is_membership_op:
        raise ParsingException(
            f"Invalid operator {op} for literal {literal}. Literal is not a sequence. "
            "Operator cannot be IN/NOT IN"
        )

    if is_sequence:
        members = tuple(Literal(None, item) for item in literal)
        return FunctionCall(None, "tuple", members)
    return Literal(None, literal)
def test_aliased_expressions_from_basic_condition() -> None:
    """
    Iterating a basic condition whose operands are aliased,
    ``f(t1.c1) as a = t1.c2 as a2``, yields the inner column, the aliased
    function, the aliased right-hand column and finally the condition.

    NOTE(review): confirm that the positional order of the ``Column``
    arguments below matches the ``t1.c1`` / ``t1.c2`` notation above.
    """
    inner_column = Column(None, "c1", "t1")
    aliased_function = FunctionCall("a", "f", [inner_column])
    aliased_column = Column("a2", "c2", "t1")

    condition = binary_condition(
        None, ConditionFunctions.EQ, aliased_function, aliased_column
    )

    assert list(condition) == [
        inner_column,
        aliased_function,
        aliased_column,
        condition,
    ]
def test_functions(
    default_validators: Mapping[str, FunctionCallValidator],
    dataset_validators: Mapping[str, FunctionCallValidator],
    exception: Optional[Type[InvalidExpressionException]],
) -> None:
    """
    Validate ``f(col)`` against the given default and dataset validators.

    Args:
        default_validators: installed as ``functions.default_validators`` for
            the duration of the test.
        dataset_validators: returned by the mocked dataset's
            ``get_function_call_validators``.
        exception: the exception type validation must raise, or ``None`` when
            validation is expected to succeed.
    """
    dataset = MagicMock()
    dataset.get_function_call_validators.return_value = dataset_validators
    dataset.get_abstract_columnset.return_value = ColumnSet([])
    expression = FunctionCall(
        None,
        "f",
        (Column(alias=None, table_name=None, column_name="col"),),
    )

    # The module-level validators are swapped for this test only; they are
    # put back afterwards so that the override cannot leak into other tests.
    previous_validators = functions.default_validators
    functions.default_validators = default_validators
    try:
        if exception is None:
            FunctionCallsValidator().validate(expression, dataset)
        else:
            with pytest.raises(exception):
                FunctionCallsValidator().validate(expression, dataset)
    finally:
        functions.default_validators = previous_validators
def transform_expression(exp: Expression) -> Expression:
    """
    Rewrite the ``tags_key`` / ``tags_value`` columns into ``arrayJoin`` calls.

    The column name has its underscores replaced by dots for the inner
    column, whose alias is removed. The ``arrayJoin`` call is aliased with the
    original alias, falling back to the original column name. Any other
    expression is returned unchanged.
    """
    # This is intentionally not configurable in order to discourage creating
    # a special syntax for expressions that should be function calls.
    special_columns = ("tags_key", "tags_value")
    if not isinstance(exp, Column) or exp.column_name not in special_columns:
        return exp

    nested_column = replace(
        exp,
        alias=None,
        column_name=exp.column_name.replace("_", "."),
    )
    return FunctionCall(
        exp.alias or exp.column_name,
        "arrayJoin",
        (nested_column,),
    )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Decide whether the query runs with FINAL or with a group exclusion.

    Nothing is done when ``request_settings.get_turbo()`` is truthy.
    Otherwise the flags returned by ``get_projects_query_flags`` for the
    project ids found in the query drive the outcome:

    - ``final`` set: FINAL is used.
    - ``final`` unset and groups to exclude: if there are more groups than
      the ``max_group_ids_exclude`` config allows, FINAL is used; otherwise a
      ``NOT IN`` condition on ``assumeNotNull(group_id)`` is added, both to
      the AST and through ``query.add_conditions``.

    ``query.set_final`` is always called (with ``False`` when no project ids
    were found).
    """
    if request_settings.get_turbo():
        return

    project_ids = get_project_ids_in_query_ast(query, self.__project_column)

    use_final = False
    legacy_condition = None

    if project_ids:
        final, exclude_group_ids = get_projects_query_flags(
            list(project_ids),
            self.__replacer_state_name,
        )
        if final:
            metrics.increment("final", tags={"cause": "final_flag"})

        if final or not exclude_group_ids:
            use_final = final
        else:
            # If the number of groups to exclude exceeds our limit, the query
            # should just use final instead of the exclusion set.
            exclusion_limit = get_config(
                "max_group_ids_exclude",
                settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE,
            )
            if len(exclude_group_ids) > exclusion_limit:
                metrics.increment("final", tags={"cause": "max_groups"})
                use_final = True
            else:
                legacy_condition = (
                    ["assumeNotNull", ["group_id"]],
                    "NOT IN",
                    exclude_group_ids,
                )
                non_null_group_id = FunctionCall(
                    None,
                    "assumeNotNull",
                    (Column(None, None, "group_id"),),
                )
                query.add_condition_to_ast(
                    not_in_condition(
                        None,
                        non_null_group_id,
                        [Literal(None, p) for p in exclude_group_ids],
                    )
                )

    query.set_final(use_final)
    if legacy_condition:
        query.add_conditions([legacy_condition])
def test_is_x_condition_functions() -> None:
    """
    Check ``is_any_binary_condition``, ``is_unary_condition`` and
    ``is_condition`` on a binary condition, a unary condition and a function
    call that is not a condition.
    """
    test_column = Column(None, None, "test")

    binary = binary_condition(
        ConditionFunctions.EQ, test_column, Literal(None, "1")
    )
    assert is_any_binary_condition(binary, ConditionFunctions.EQ)
    assert not is_any_binary_condition(binary, ConditionFunctions.NEQ)

    unary = unary_condition(
        ConditionFunctions.IS_NOT_NULL, Column(None, None, "test")
    )
    assert is_unary_condition(unary, ConditionFunctions.IS_NOT_NULL)
    assert not is_unary_condition(unary, ConditionFunctions.IS_NULL)
    assert not is_unary_condition(binary, ConditionFunctions.IS_NOT_NULL)

    # A function call whose name is not one of the condition functions.
    not_a_condition = FunctionCall(
        None, "isNotNullish", (Column(None, None, "test"),)
    )
    assert is_condition(binary)
    assert is_condition(unary)
    assert not is_condition(not_a_condition)
def test_build_request(
    body: MutableMapping[str, Any], language: Language, condition: Expression
) -> None:
    """
    Build a request for the ``events`` dataset and compare it to the
    expected query.

    Args:
        body: request body handed to ``build_request``.
        language: selects the legacy parser or the SnQL parser.
        condition: condition the resulting query is expected to carry.
    """
    dataset = get_dataset("events")
    entity = dataset.get_default_entity()
    schema = RequestSchema.build_with_extensions(
        entity.get_extensions(),
        HTTPRequestSettings,
        language,
    )

    parser = (
        parse_legacy_query
        if language == Language.LEGACY
        else partial(parse_snql_query, [])
    )
    request = build_request(
        body,
        parser,
        HTTPRequestSettings,
        schema,
        dataset,
        Timer("test"),
        "my_request",
    )

    expected_query = Query(
        from_clause=Entity(EntityKey.EVENTS, entity.get_data_model()),
        selected_columns=[
            SelectedExpression(
                name="time",
                expression=Column(
                    alias="_snuba_time", table_name=None, column_name="time"
                ),
            ),
            SelectedExpression(
                "count", FunctionCall("_snuba_count", "count", tuple())
            ),
        ],
        condition=condition,
        groupby=[Column("_snuba_time", None, "time")],
        limit=1000,
        granularity=60,
    )

    assert request.referrer == "my_request"
    assert dict(request.body) == body

    status, differences = request.query.equals(expected_query)
    # Assert on truthiness instead of comparing with ``== True``.
    assert status, f"Query mismatch: {differences}"
def visit_function_call(
    node: Node,
    visited_children: Tuple[
        str, Any, List[Expression], Any, Union[Node, List[Expression]]
    ],
) -> Expression:
    """
    Build a ``FunctionCall`` or ``CurriedFunctionCall`` from the visited
    children of a function-call node.

    The children are the function name, the first parameter list and an
    optional second parameter list. Without a second list a plain
    ``FunctionCall`` is returned; with one, the plain call becomes the
    internal function of a ``CurriedFunctionCall``.
    """
    name, _, first_params, _, second_group = visited_children
    internal_call = FunctionCall(None, name, tuple(first_params))

    # An empty-text node means there is no second parameter list at all.
    if isinstance(second_group, Node) and second_group.text == "":
        return internal_call

    _, second_params, _ = second_group
    # When the second parameter list is empty it does not come through as an
    # empty list, so anything that is not a list/tuple is treated as "no
    # parameters".
    curried_params = (
        tuple(second_params)
        if isinstance(second_params, (list, tuple))
        else ()
    )
    return CurriedFunctionCall(None, internal_call, curried_params)
def test_add_equivalent_condition(
    initial_condition: Expression,
    join_clause: JoinClause[EntitySource],
    expected_expr: Expression,
) -> None:
    """
    ``add_equivalent_conditions`` turns ``initial_condition`` into
    ``expected_expr`` for a composite query over ``join_clause``.

    The events and groupedmessages entities are registered in
    ``ENTITY_IMPL`` for the duration of the test.
    """
    try:
        ENTITY_IMPL[EntityKey.EVENTS] = Events()
        ENTITY_IMPL[EntityKey.GROUPEDMESSAGES] = GroupedMessage()

        query = CompositeQuery(
            from_clause=join_clause,
            selected_columns=[
                SelectedExpression(
                    "group_id",
                    FunctionCall("something", "f", (Column(None, "gr", "id"),)),
                )
            ],
            condition=initial_condition,
        )
        add_equivalent_conditions(query)
        assert query.get_condition() == expected_expr
    finally:
        # Always reset the global registry, even when the assertion (or the
        # setup) fails, so that a failure here cannot affect other tests.
        ENTITY_IMPL.clear()
def preprocess_condition_function_literal(func: str, literal: Any) -> Expression:
    """
    Turn a raw literal into an expression, validating it against ``func``.

    A list or tuple becomes a ``tuple(...)`` call whose members are parsed
    with ``parse_string_to_expr``; it is only accepted for the ``IN`` /
    ``NOT_IN`` condition functions. A scalar is rejected for those functions;
    otherwise a string is parsed with ``parse_string_to_expr`` and any other
    value is wrapped in a ``Literal``.

    Raises:
        ParsingException: when the function and the literal kind disagree.
    """
    is_sequence = isinstance(literal, (list, tuple))
    is_membership = func in (ConditionFunctions.IN, ConditionFunctions.NOT_IN)

    if is_sequence and not is_membership:
        raise ParsingException(
            f"Invalid function {func} for literal {literal}. Literal is a sequence. "
            "Function must be in()/notIn()"
        )
    if not is_sequence and is_membership:
        raise ParsingException(
            f"Invalid function {func} for literal {literal}. Literal is not a sequence. "
            "Function cannot be in()/notIn()"
        )

    if is_sequence:
        members = tuple(parse_string_to_expr(item) for item in literal)
        return FunctionCall(None, "tuple", members)
    if isinstance(literal, str):
        return parse_string_to_expr(literal)
    return Literal(None, literal)
def __process_condition(self, exp: Expression) -> Expression:
    """
    Replace the string literal of a matched condition with a parsed datetime.

    Expressions matched by ``self.condition_match`` get their second
    parameter replaced by a ``Literal`` holding the result of
    ``parse_datetime`` (keeping the literal's alias); the function name,
    alias and first parameter are preserved. Unmatched expressions are
    returned unchanged.

    Raises:
        InvalidQueryException: when the literal cannot be parsed as a
            datetime.
    """
    result = self.condition_match.match(exp)
    if result is None:
        return exp

    literal = result.expression("literal")
    assert isinstance(exp, FunctionCall)  # mypy
    assert isinstance(literal, Literal)  # mypy

    try:
        value = parse_datetime(str(literal.value))
    except ValueError as err:
        column_name = result.string("column_name")
        # The offending value is quoted with exactly one apostrophe on each
        # side.
        raise InvalidQueryException(
            f"Illegal datetime in condition on column {column_name}: '{literal.value}'"
        ) from err

    return FunctionCall(
        exp.alias,
        exp.function_name,
        (exp.parameters[0], Literal(literal.alias, value)),
    )
def replace_time_condition_aliases(exp: Expression) -> Expression:
    """
    Re-alias the column of a two-parameter call on ``_snuba_timestamp``.

    When the first parameter of a two-parameter function call is a column
    aliased ``_snuba_timestamp``, that column is rebuilt with the alias
    ``_snuba_<selected_entity.required_time_column>``, keeping its table and
    column name. Everything else is returned unchanged.
    """
    if not isinstance(exp, FunctionCall) or len(exp.parameters) != 2:
        return exp

    time_column = exp.parameters[0]
    if not isinstance(time_column, Column) or time_column.alias != "_snuba_timestamp":
        return exp

    realiased_column = Column(
        f"_snuba_{selected_entity.required_time_column}",
        time_column.table_name,
        time_column.column_name,
    )
    return FunctionCall(
        exp.alias,
        exp.function_name,
        (realiased_column, exp.parameters[1]),
    )
def __group_time_function(
    self, column_name: str, granularity: int, alias: Optional[str]
) -> FunctionCall:
    """
    Build the function call that buckets ``column_name`` by ``granularity``.

    Granularities of 3600, 60 and 86400 map to ``toStartOfHour``,
    ``toStartOfMinute`` and ``toDate`` respectively. Any other granularity
    uses ``toDateTime(multiply(intDiv(toUInt32(column), granularity),
    granularity))``. Every variant receives ``"Universal"`` as its last
    argument and carries ``alias``.
    """
    dedicated_functions = {
        3600: "toStartOfHour",
        60: "toStartOfMinute",
        86400: "toDate",
    }
    function_name = dedicated_functions.get(granularity)
    if function_name is not None:
        return FunctionCall(
            alias,
            function_name,
            (Column(None, None, column_name), Literal(None, "Universal")),
        )

    # No dedicated function: round the timestamp down to a multiple of the
    # granularity.
    as_integer = FunctionCall(
        None, "toUInt32", (Column(None, None, column_name),)
    )
    bucket_index = FunctionCall(
        None, "intDiv", (as_integer, Literal(None, granularity))
    )
    return FunctionCall(
        alias,
        "toDateTime",
        (
            multiply(bucket_index, Literal(None, granularity)),
            Literal(None, "Universal"),
        ),
    )
def parse_aggregation(
    aggregation_function: str, column: Any, alias: Optional[str]
) -> Expression:
    """
    Parse an aggregation into an AST expression.

    Aggregations, unfortunately, support both Snuba syntax and a subset of
    ClickHouse syntax. To preserve this behavior and still build a meaningful
    AST, the aggregation function string is parsed with the minimal
    ClickHouse grammar (not that we should support this, but it is used in
    production).

    Three shapes are accepted:

    - plain function name plus columns, e.g. ``["count", ["c1", "c2"]]``;
    - a full ClickHouse expression without columns, e.g.
      ``["ifNull(count(something), something)", None, None]``;
    - a ClickHouse function call plus columns, e.g. ``["f(a)", "b", None]``,
      which produces a curried function.

    Raises:
        ValueError: for any other combination.
    """
    expression_tree = minimal_clickhouse_grammar.parse(aggregation_function)
    parsed_expression = ClickhouseVisitor().visit(expression_tree)

    raw_columns: Iterable[Any] = (
        column if isinstance(column, (list, tuple)) else (column,)
    )
    # Falsy entries (e.g. None or "") are skipped.
    columns_expr = [parse_expression(raw) for raw in raw_columns if raw]

    if isinstance(parsed_expression, str):
        # Simple aggregation with snuba syntax.
        return FunctionCall(alias, parsed_expression, tuple(columns_expr))

    if (
        isinstance(parsed_expression, (FunctionCall, CurriedFunctionCall))
        and not columns_expr
    ):
        # Simple ClickHouse expression with no snuba syntax.
        return replace(parsed_expression, alias=alias)

    if isinstance(parsed_expression, FunctionCall) and columns_expr:
        # Mix of ClickHouse syntax and snuba syntax: a curried function.
        return CurriedFunctionCall(
            alias,
            parsed_expression,
            tuple(columns_expr),
        )

    raise ValueError(
        f"Invalid aggregation format {aggregation_function} {column}")
def attempt_map(
    self,
    expression: CurriedFunctionCall,
    children_translator: SnubaClickhouseStrictTranslator,
) -> Optional[CurriedFunctionCall]:
    """
    Map a curried call whose translated parameters are all null.

    A translated parameter counts as null when it is matched by
    ``self.function_match`` or is a ``Literal`` holding ``None``. If there is
    at least one parameter and all of them are null, the call is replaced by
    ``<internal function>OrNull`` curried over ``NULL`` literals; otherwise
    ``None`` is returned.
    """
    internal_function = expression.internal_function.accept(
        children_translator)
    assert isinstance(internal_function, FunctionCall)  # mypy
    parameters = tuple(
        p.accept(children_translator) for p in expression.parameters)

    def is_null_like(param: Expression) -> bool:
        # Handle wrapped functions that have been converted to ifNull(NULL, NULL)
        if self.function_match.match(param) is not None:
            return True
        return isinstance(param, Literal) and param.value is None

    if not parameters or not all(is_null_like(p) for p in parameters):
        return None

    # Currently curried function mappers require returning other curried
    # functions. So return this to keep the mapper happy.
    return CurriedFunctionCall(
        alias=expression.alias,
        internal_function=FunctionCall(
            None,
            f"{internal_function.function_name}OrNull",
            internal_function.parameters,
        ),
        parameters=tuple(Literal(None, None) for _ in parameters),
    )
def __init__(self) -> None:
    """
    Wire the entity to the raw (writable) and hourly sessions storages.

    Queries are planned against the hourly storage only; its schema provides
    the abstract column set. The translation mappers turn
    ``duration_quantiles`` into a curried ``quantilesIfMerge(0.5, 0.9)`` call
    and map the session/user counters through ``function_rule``.
    """
    writable_storage = get_writable_storage(StorageKey.SESSIONS_RAW)
    materialized_storage = get_storage(StorageKey.SESSIONS_HOURLY)
    read_schema = materialized_storage.get_schema()

    self.__time_group_columns = {"bucketed_started": "started"}
    self.__time_parse_columns = ("started", "received")

    duration_quantiles_mapper = ColumnToCurriedFunction(
        None,
        "duration_quantiles",
        FunctionCall(
            None,
            "quantilesIfMerge",
            (Literal(None, 0.5), Literal(None, 0.9)),
        ),
        (Column(None, None, "duration_quantiles"),),
    )
    count_columns = ("sessions", "sessions_crashed", "sessions_abnormal")
    uniq_columns = (
        "users",
        "sessions_errored",
        "users_crashed",
        "users_abnormal",
        "users_errored",
    )
    column_mappers = [
        duration_quantiles_mapper,
        *(function_rule(name, "countIfMerge") for name in count_columns),
        *(function_rule(name, "uniqIfMerge") for name in uniq_columns),
    ]

    super().__init__(
        storages=[writable_storage, materialized_storage],
        # TODO: Once we are ready to expose the raw data model and select whether to use
        # materialized storage or the raw one here, replace this with a custom storage
        # selector that decides when to use the materialized data.
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=materialized_storage,
            mappers=TranslationMappers(columns=column_mappers),
        ),
        abstract_column_set=read_schema.get_columns(),
        writable_storage=writable_storage,
    )
def transform_nested_column(exp: Expression) -> Expression:
    """
    Replace a subscript on a mapping column with its promoted column.

    If ``exp`` is a subscriptable reference whose column and key have an
    entry in ``self.__specs``, it is replaced by the promoted column. The
    promoted column is wrapped in ``toString`` unless its type name contains
    ``String`` but not ``FixedString``. Anything else is returned unchanged.
    """
    subscript = match_subscriptable_reference(exp)
    if subscript is None or subscript.column_name not in self.__specs:
        return exp

    promoted_col_name = self.__specs[subscript.column_name].get(subscript.key)
    if promoted_col_name is None:
        return exp

    col_type = (
        query.get_from_clause().get_columns().get(promoted_col_name, None)
    )
    col_type_name = str(col_type) if col_type else None

    # We need to pass the content of the promoted column to a toString
    # function when the promoted column is not a string since the
    # supported values of mapping columns are strings and the clients
    # expect such.
    if (
        col_type_name
        and "String" in col_type_name
        and "FixedString" not in col_type_name
    ):
        return Column(exp.alias, subscript.table_name, promoted_col_name)

    return FunctionCall(
        exp.alias,
        "toString",
        (Column(None, subscript.table_name, promoted_col_name),),
    )
def __init__(self) -> None:
    """
    Configure the entity with the distributions storages, value schema and
    translation mappers.

    The value schema holds ``percentiles`` plus ``min``, ``max``, ``avg``,
    ``sum`` and ``count`` aggregate columns. ``percentiles`` is translated
    into a curried ``quantilesMerge(0.5, 0.75, 0.9, 0.95, 0.99)`` call and
    the other columns go through ``merge_mapper``.
    """
    quantile_levels = (0.5, 0.75, 0.9, 0.95, 0.99)
    simple_aggregates = ("min", "max", "avg", "sum", "count")

    value_schema = [
        Column(
            "percentiles",
            AggregateFunction(
                "quantiles(0.5, 0.75, 0.9, 0.95, 0.99)", [Float(64)]
            ),
        ),
        *(
            Column(name, AggregateFunction(name, [Float(64)]))
            for name in simple_aggregates
        ),
    ]

    percentiles_mapper = ColumnToCurriedFunction(
        None,
        "percentiles",
        FunctionCall(
            None,
            "quantilesMerge",
            tuple(Literal(None, level) for level in quantile_levels),
        ),
        (ColumnExpr(None, None, "percentiles"),),
    )
    column_mappers = [
        percentiles_mapper,
        *(merge_mapper(name) for name in simple_aggregates),
    ]

    super().__init__(
        writable_storage_key=StorageKey.METRICS_DISTRIBUTIONS_BUCKETS,
        readable_storage_key=StorageKey.METRICS_DISTRIBUTIONS,
        value_schema=value_schema,
        mappers=TranslationMappers(columns=column_mappers),
    )
def time_expr(
    self, column_name: str, granularity: int, alias: Optional[str]
) -> FunctionCall:
    """
    Build the function call that buckets a time column by ``granularity``.

    Args:
        column_name: name used to build the column reference that is
            bucketed.
        granularity: bucket size; 3600, 60 and 86400 map to
            ``toStartOfHour``, ``toStartOfMinute`` and ``toDate``.
        alias: alias given to the returned function call.

    Returns:
        A ``FunctionCall`` (not a string). For granularities without a
        dedicated function it is the equivalent of
        ``toDateTime(intDiv(toUInt32({column}), {granularity}) * {granularity})``.
    """
    dedicated_functions = {
        3600: "toStartOfHour",
        60: "toStartOfMinute",
        86400: "toDate",
    }
    function_name = dedicated_functions.get(granularity)
    if function_name is not None:
        return FunctionCall(
            alias, function_name, (Column(None, column_name, None),)
        )

    # "toDateTime(intDiv(toUInt32({column}), {granularity}) * {granularity})",
    as_integer = FunctionCall(
        None, "toUInt32", (Column(None, column_name, None),)
    )
    bucket_index = FunctionCall(
        None, "intDiv", (as_integer, Literal(None, granularity))
    )
    return FunctionCall(
        alias,
        "toDateTime",
        (multiply(bucket_index, Literal(None, granularity)),),
    )
Lambda, Literal, SubscriptableReference, ) test_data = [ Column("alias", "table", "col"), Literal("alias", 123), Argument("alias", "arg"), SubscriptableReference("tags[asd]", Column(None, None, "tags"), Literal(None, "release")), FunctionCall( "alias", "f", ( Column(None, "table", "col"), Literal(None, 123), FunctionCall(None, "f1", (Column(None, None, "col2"), )), ), ), CurriedFunctionCall( None, FunctionCall(None, "f", (Column(None, None, "col"), Literal(None, 12))), (Column(None, None, "col3"), ), ), Lambda(None, ("a", "b"), FunctionCall(None, "f", (Argument(None, "a"), ))), ] @pytest.mark.parametrize("expression", test_data)