def process_column(exp: Expression) -> Expression:
    """
    Rewrite an expression that matches ``matcher`` into an
    if(IN(expr, ('1', 'True')), 'True', 'False') call, keeping the
    original alias on the outer function. Non-matching expressions
    are returned untouched.
    """
    if matcher.match(exp) is None:
        return exp

    # The original alias moves to the wrapping "if"; the inner copy
    # must therefore be unaliased to avoid alias duplication.
    unaliased = replace(exp, alias=None)
    truthy_values = literals_tuple(
        None, [LiteralExpr(None, "1"), LiteralExpr(None, "True")]
    )
    membership = binary_condition(ConditionFunctions.IN, unaliased, truthy_values)
    return FunctionCallExpr(
        exp.alias,
        "if",
        (membership, LiteralExpr(None, "True"), LiteralExpr(None, "False")),
    )
def test_not_in_condition() -> None:
    """NOT IN conditions are recognized and destructured by the pattern matcher."""
    tags_column = Column(None, None, "tags_key")
    literal_pair = literals_tuple(None, [Literal(None, "t1"), Literal(None, "t2")])
    condition = binary_condition(ConditionFunctions.NOT_IN, tags_column, literal_pair)

    assert is_not_in_condition(condition)

    pattern = is_not_in_condition_pattern(ColumnPattern(None, String("tags_key")))
    result = pattern.match(condition)
    assert result is not None
    # The matcher exposes both sides of the condition by name.
    assert result.expression("lhs") == tags_column
    assert result.expression("tuple") == literal_pair
def test_processing_functions() -> None:
    """Sanity checks for the binary-condition introspection helpers."""
    tag_keys_in = binary_condition(
        None,
        ConditionFunctions.IN,
        Column(None, "tag_keys", None),
        literals_tuple(None, [Literal(None, "t1"), Literal(None, "t2")]),
    )
    assert is_in_condition(tag_keys_in)

    equality = binary_condition(
        None, ConditionFunctions.EQ, Column(None, "test", None), Literal(None, "1")
    )
    # is_binary_condition must discriminate on the exact function name.
    assert is_binary_condition(equality, ConditionFunctions.EQ)
    assert not is_binary_condition(equality, ConditionFunctions.NEQ)
def replace_exp(exp: Expression) -> Expression:
    """
    Rewrite a matching expression into a multiIf that maps the empty
    string to itself and normalizes truthy markers ('1'/'True') to
    'True', everything else to 'False'. Non-matching expressions are
    returned unchanged.
    """
    if matcher.match(exp) is None:
        return exp

    # Alias moves onto the wrapping multiIf; strip it from the inner copy.
    unaliased = replace(exp, alias=None)
    empty_string = Literal(None, "")
    truthy_values = literals_tuple(None, [Literal(None, "1"), Literal(None, "True")])
    return FunctionCallExpr(
        exp.alias,
        "multiIf",
        (
            binary_condition(ConditionFunctions.EQ, unaliased, empty_string),
            empty_string,
            binary_condition(ConditionFunctions.IN, unaliased, truthy_values),
            Literal(None, "True"),
            Literal(None, "False"),
        ),
    )
def some_tuple(alias: str | None):
    """Build the literal tuple ('duration', 300) under the given alias."""
    members = [Literal(None, "duration"), Literal(None, 300)]
    return literals_tuple(alias, members)
def __set_condition(alias: Optional[str], function: str, lhs: Expression, rhs: Sequence[Literal]) -> Expression:
    """Wrap the rhs literals into a tuple and build the binary condition."""
    rhs_tuple = literals_tuple(None, rhs)
    return binary_condition(alias, function, lhs, rhs_tuple)
def test_events_boolean_context() -> None:
    """
    The boolean-contexts processor wraps a context lookup in
    if(IN(lookup, ('1', 'True')), 'True', 'False'), preserving the
    original alias on the outer call.
    """
    columns = ColumnSet(
        [("contexts", Nested([("key", String()), ("value", String())]))]
    )
    alias = "contexts[device.charging]"

    def context_lookup(fn_alias):
        # arrayElement(contexts.value, indexOf(contexts.key, 'device.charging'))
        return FunctionCall(
            fn_alias,
            "arrayElement",
            (
                Column(None, None, "contexts.value"),
                FunctionCall(
                    None,
                    "indexOf",
                    (
                        Column(None, None, "contexts.key"),
                        Literal(None, "device.charging"),
                    ),
                ),
            ),
        )

    query = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[SelectedExpression(alias, context_lookup(alias))],
    )

    expected = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[
            SelectedExpression(
                alias,
                FunctionCall(
                    alias,
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            # Inner lookup loses its alias to the outer "if".
                            context_lookup(None),
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    EventsBooleanContextsProcessor().process_query(query, settings)
    assert query.get_selected_columns() == expected.get_selected_columns()
def test_events_promoted_boolean_context() -> None:
    """
    After promoting the context to its dedicated column (cast to string),
    the boolean processor rewrites it into an if(IN(...)) normalization.
    """
    columns = ColumnSet(
        [
            ("device_charging", UInt(8, Modifier(nullable=True))),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )
    alias = "contexts[device.charging]"

    # arrayElement(contexts.value, indexOf(contexts.key, 'device.charging'))
    lookup = FunctionCall(
        alias,
        "arrayElement",
        (
            Column(None, None, "contexts.value"),
            FunctionCall(
                None,
                "indexOf",
                (
                    Column(None, None, "contexts.key"),
                    Literal(None, "device.charging"),
                ),
            ),
        ),
    )
    query = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[SelectedExpression(alias, lookup)],
    )

    # The promoter replaces the lookup with toString(device_charging).
    promoted = FunctionCall(None, "toString", (Column(None, None, "device_charging"),))
    expected = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[
            SelectedExpression(
                alias,
                FunctionCall(
                    alias,
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            promoted,
                            literals_tuple(
                                None, [Literal(None, "1"), Literal(None, "True")]
                            ),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPQuerySettings()
    MappingColumnPromoter(
        {"contexts": {"device.charging": "device_charging"}}, cast_to_string=True
    ).process_query(query, settings)
    EventsPromotedBooleanContextsProcessor().process_query(query, settings)
    assert query.get_selected_columns() == expected.get_selected_columns()
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split query in 2 steps if a large number of columns is being selected.
    - First query only selects event_id, project_id and timestamp.
    - Second query selects all fields for only those events.
    - Shrink the date range.
    """
    # Only bounded, non-aggregate queries can be narrowed by event id:
    # without a LIMIT (or with groupby/aggregations) the second query
    # could not be restricted to a finite set of events.
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or query.get_aggregations()
        or not query.get_selected_columns()
    ):
        return None
    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )
    for expr in query.get_condition_from_ast() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs
    # not the number of distinct Column objects in the query
    # so to avoid counting aliased columns multiple times.
    total_columns = {
        (col.table_name, col.column_name)
        for col in query.get_all_ast_referenced_columns()
    }

    # The minimal query runs first and fetches only the identifying
    # columns; its result is used below to narrow the full query.
    minimal_query = copy.deepcopy(query)
    minimal_query.set_selected_columns(
        [self.__id_column, self.__project_column, self.__timestamp_column]
    )
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns([
        SelectedExpression(self.__id_column, ColumnExpr(None, None, self.__id_column)),
        SelectedExpression(
            self.__project_column, ColumnExpr(None, None, self.__project_column)
        ),
        SelectedExpression(
            self.__timestamp_column,
            ColumnExpr(None, None, self.__timestamp_column),
        ),
    ])

    # If some other expression aliased one of the identifying column names,
    # the plain column selection above may shadow it; log so bad splits can
    # be spotted in production.
    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    # Splitting only pays off when the full query references strictly more
    # columns than the minimal one.
    minimal_columns = {
        (col.table_name, col.column_name)
        for col in minimal_query.get_all_ast_referenced_columns()
    }
    if len(total_columns) <= len(minimal_columns):
        return None

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    legacy_references = set(minimal_query.get_all_referenced_columns())
    ast_column_names = {
        c.column_name for c in minimal_query.get_all_ast_referenced_columns()
    }
    # Ensures the legacy minimal query (which does not expand alias references)
    # does not contain alias references we removed when creating minimal_query.
    if legacy_references - ast_column_names:
        metrics.increment("columns.skip_invalid_legacy_query")
        return None

    result = runner(minimal_query, request_settings)
    del minimal_query

    # No matching events: nothing for the second query to fetch.
    if not result.result["data"]:
        return None

    # Making a copy just in case runner returned None (which would drive the execution
    # strategy to ignore the result of this splitter and try the next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be runing a query that is beyond clickhouse maximum query size,
        # so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    # Narrow the full query to exactly the events found, in both the legacy
    # and the AST condition representations (they must stay in lockstep).
    query.add_conditions([(self.__id_column, "IN", event_ids)])
    query.add_condition_to_ast(
        in_condition(
            None,
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    # TODO: This is technically wrong. Event ids are unique per project, not globally.
    # So, if the minimal query only returned the same event_id from two projects, we
    # would be underestimating the limit here.
    query.set_limit(len(event_ids))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_condition(
        query,
        self.__project_column,
        "IN",
        project_ids,
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    # Shrink the date range to the window actually covered by the events.
    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_condition(
        query,
        self.__timestamp_column,
        ">=",
        util.parse_datetime(min(timestamps)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_condition(
        query,
        self.__timestamp_column,
        "<",
        (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
        ),
    )

    return runner(query, request_settings)
def execute(
    self,
    query: Query,
    query_settings: QuerySettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split query in 2 steps if a large number of columns is being selected.
    - First query only selects event_id, project_id and timestamp.
    - Second query selects all fields for only those events.
    - Shrink the date range.
    """
    # Only bounded, non-grouped queries can be narrowed by event id.
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or not query.get_selected_columns()
    ):
        return None
    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )
    for expr in query.get_condition() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs
    # not the number of distinct Column objects in the query
    # so to avoid counting aliased columns multiple times.
    selected_columns = {
        (col.table_name, col.column_name)
        for col in query.get_columns_referenced_in_select()
    }
    # Splitting only pays off when enough columns are selected.
    if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
        metrics.increment("column_splitter.main_query_min_threshold")
        return None

    minimal_query = copy.deepcopy(query)

    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns([
        SelectedExpression(
            self.__id_column,
            ColumnExpr(self.__id_column, None, self.__id_column),
        ),
        SelectedExpression(
            self.__project_column,
            ColumnExpr(self.__project_column, None, self.__project_column),
        ),
        SelectedExpression(
            self.__timestamp_column,
            ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
        ),
    ])

    # If some other expression aliased one of the identifying column names,
    # the plain column selection above may shadow it; log so bad splits can
    # be spotted in production.
    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
    # they fail on distributed tables. For that specific case, skip the query splitter.
    for orderby in minimal_query.get_orderby():
        if isinstance(orderby.expression, (FunctionCallExpr, CurriedFunctionCallExpr)):
            metrics.increment("column_splitter.orderby_has_a_function")
            return None

    result = runner(minimal_query, query_settings)
    del minimal_query

    if not result.result["data"]:
        metrics.increment("column_splitter.no_data_from_minimal_query")
        return None

    # Making a copy just in case runner returned None (which would drive the execution
    # strategy to ignore the result of this splitter and try the next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be runing a query that is beyond clickhouse maximum query size,
        # so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    # Restrict the full query to exactly the events found by the minimal one.
    query.add_condition_to_ast(
        in_condition(
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    query.set_limit(len(result.result["data"]))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    # Shrink the date range to the window actually covered by the results.
    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
        ),
    )

    return runner(query, query_settings)
def test_events_boolean_context() -> None:
    """
    After column promotion, the boolean-contexts processor rewrites the
    promoted lookup into a multiIf that maps '' to '' and truthy markers
    ('1'/'True') to 'True', anything else to 'False'.
    """
    columns = ColumnSet([
        ("device_charging", Nullable(UInt(8))),
        ("contexts", Nested([("key", String()), ("value", String())])),
    ])
    alias = "contexts[device.charging]"

    query = ClickhouseQuery(
        LogicalQuery(
            {},
            TableSource("events", columns),
            selected_columns=[
                SelectedExpression(
                    alias,
                    FunctionCall(
                        alias,
                        "arrayElement",
                        (
                            Column(None, None, "contexts.value"),
                            FunctionCall(
                                None,
                                "indexOf",
                                (
                                    Column(None, None, "contexts.key"),
                                    Literal(None, "device.charging"),
                                ),
                            ),
                        ),
                    ),
                )
            ],
        )
    )

    def promoted():
        # toString(device_charging): the promoted raw column cast to string.
        return FunctionCall(None, "toString", (Column(None, None, "device_charging"),))

    empty = Literal(None, "")
    expected = ClickhouseQuery(
        LogicalQuery(
            {},
            TableSource("events", columns),
            selected_columns=[
                SelectedExpression(
                    alias,
                    FunctionCall(
                        alias,
                        "multiIf",
                        (
                            binary_condition(
                                None, ConditionFunctions.EQ, promoted(), empty
                            ),
                            empty,
                            binary_condition(
                                None,
                                ConditionFunctions.IN,
                                promoted(),
                                literals_tuple(
                                    None,
                                    [Literal(None, "1"), Literal(None, "True")],
                                ),
                            ),
                            Literal(None, "True"),
                            Literal(None, "False"),
                        ),
                    ),
                )
            ],
        )
    )

    settings = HTTPRequestSettings()
    MappingColumnPromoter(
        {"contexts": {"device.charging": "device_charging"}}
    ).process_query(query, settings)
    EventsBooleanContextsProcessor().process_query(query, settings)
    assert (
        query.get_selected_columns_from_ast()
        == expected.get_selected_columns_from_ast()
    )