def get_project_ids_in_condition(condition: Expression) -> Optional[Set[int]]:
    """
    Extract project ids from an expression. Returns None if no project_id
    condition is found. Returns an empty set if conflicting project_id
    conditions are found.
    """
    match = FunctionCall(
        None,
        String(ConditionFunctions.EQ),
        (
            Column(column_name=String(project_column)),
            Literal(value=Param("project_id", Any(int))),
        ),
    ).match(condition)
    if match is not None:
        return {match.integer("project_id")}

    match = is_in_condition_pattern(
        Column(column_name=String(project_column))
    ).match(condition)
    if match is not None:
        projects = match.expression("tuple")
        assert isinstance(projects, FunctionCallExpr)
        return {
            l.value
            for l in projects.parameters
            if isinstance(l, LiteralExpr) and isinstance(l.value, int)
        }

    match = FunctionCall(
        None,
        Param(
            "operator",
            Or([String(BooleanFunctions.AND), String(BooleanFunctions.OR)]),
        ),
        (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
    ).match(condition)
    if match is not None:
        lhs_projects = get_project_ids_in_condition(match.expression("lhs"))
        rhs_projects = get_project_ids_in_condition(match.expression("rhs"))
        if lhs_projects is None:
            return rhs_projects
        elif rhs_projects is None:
            return lhs_projects
        else:
            return (
                lhs_projects & rhs_projects
                if match.string("operator") == BooleanFunctions.AND
                else lhs_projects | rhs_projects
            )

    return None
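# Illustrative usage sketch, not part of the original module: it assumes
# project_column == "project_id" and the usual Snuba AST constructors
# Column(alias, table_name, column_name), Literal(alias, value) and
# FunctionCall(alias, function_name, parameters), imported under the *Expr
# aliases used above. An AND of an = condition and an IN condition
# intersects down to the common project id.
condition = FunctionCallExpr(
    None,
    BooleanFunctions.AND,
    (
        FunctionCallExpr(
            None,
            ConditionFunctions.EQ,
            (ColumnExpr(None, None, "project_id"), LiteralExpr(None, 1)),
        ),
        FunctionCallExpr(
            None,
            ConditionFunctions.IN,
            (
                ColumnExpr(None, None, "project_id"),
                FunctionCallExpr(
                    None, "tuple", (LiteralExpr(None, 1), LiteralExpr(None, 2))
                ),
            ),
        ),
    ),
)
assert get_project_ids_in_condition(condition) == {1}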
def replace_condition(expression: Expression) -> Expression:
    match = FunctionCall(
        String(OPERATOR_TO_FUNCTION[operator]),
        (Param("column", Column(None, String(field))), AnyExpression()),
    ).match(expression)

    return (
        expression
        if match is None
        else replace(
            expression, parameters=(match.expression("column"), new_operand)
        )
    )
    ),
    (
        "Matches a column with all fields",
        Column(
            Param("table_name", AnyOptionalString()),
            Param("column_name", Any(str)),
        ),
        ColumnExpr("alias", "table_name", "test_col"),
        MatchResult({"column_name": "test_col", "table_name": "table_name"}),
    ),
    (
        "Match anything",
        AnyExpression(),
        ColumnExpr(None, None, "something_irrelevant"),
        MatchResult(),
    ),
    (
        "Match a string through Any(str)",
        Column(Param("p_table_name", Any(str)), None),
        ColumnExpr("irrelevant", "table_name", "irrelevant"),
        MatchResult({"p_table_name": "table_name"}),
    ),
    (
        "Match a None string through Any",
        Column(Param("p_table_name", Any(type(None))), None),
        ColumnExpr("irrelevant", None, "irrelevant"),
        MatchResult({"p_table_name": None}),
    ),
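# Standalone illustration, not one of the test cases above: the first case in
# the table is equivalent to this direct call, which binds both Params on a
# successful match. Names mirror the imports the table already uses (matcher
# classes plus ColumnExpr for the AST column).
pattern = Column(
    Param("table_name", AnyOptionalString()),
    Param("column_name", Any(str)),
)
assert pattern.match(ColumnExpr("alias", "table_name", "test_col")) == MatchResult(
    {"column_name": "test_col", "table_name": "table_name"}
)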
    return binary_condition(alias, function, lhs, literals_tuple(None, rhs))


def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    return FunctionCallPattern(
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of
                # parameters, so test this directly.
                return all(
                    isinstance(c, Literal) for c in exp.parameters[1].parameters
                )
    return False
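# Illustrative check, not part of the original module: __is_set_condition
# accepts `col IN tuple(...)` only when every element of the tuple is a
# Literal, since, as the comment above notes, the matcher alone cannot bound
# the number of parameters. It assumes ConditionFunctions.IN is a key of
# FUNCTION_TO_OPERATOR; Column, FunctionCall and Literal are the plain AST
# nodes, as in the asserts above.
in_tuple = FunctionCall(
    None,
    ConditionFunctions.IN,
    (
        Column(None, None, "project_id"),
        FunctionCall(None, "tuple", (Literal(None, 1), Literal(None, 2))),
    ),
)
assert __is_set_condition(in_tuple, ConditionFunctions.IN)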
        String(operator),
        (
            Param("lhs", lhs),
            Param(
                "sequence",
                Or(
                    [
                        FunctionCallPattern(String("tuple"), None),
                        FunctionCallPattern(String("array"), None),
                    ]
                ),
            ),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_any_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of
                # parameters, so test this directly.
                return all(
                    isinstance(c, Literal) for c in exp.parameters[1].parameters
                )
    return False
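# Companion sketch for the Or branch above (illustrative, same assumptions as
# the previous example): in this version a set condition may wrap its
# right-hand side in either tuple() or array(), so an array also qualifies.
in_array = FunctionCall(
    None,
    ConditionFunctions.IN,
    (
        Column(None, None, "tags.key"),
        FunctionCall(None, "array", (Literal(None, "a"), Literal(None, "b"))),
    ),
)
assert __is_set_condition(in_array, ConditionFunctions.IN)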
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split the query in two steps if a large number of columns is being
    selected:
    - the first query selects only event_id, project_id and timestamp;
    - the second query selects all fields for only those events and shrinks
      the date range accordingly.
    """
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or query.get_aggregations()
        or not query.get_selected_columns()
    ):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already an = or IN condition on an ID column.
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )

    for expr in query.get_condition_from_ast() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs, not the number
    # of distinct Column objects in the query, so as to avoid counting
    # aliased columns multiple times.
    total_columns = {
        (col.table_name, col.column_name)
        for col in query.get_all_ast_referenced_columns()
    }

    minimal_query = copy.deepcopy(query)
    minimal_query.set_selected_columns(
        [self.__id_column, self.__project_column, self.__timestamp_column]
    )
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns(
        [
            SelectedExpression(
                self.__id_column, ColumnExpr(None, None, self.__id_column)
            ),
            SelectedExpression(
                self.__project_column, ColumnExpr(None, None, self.__project_column)
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ]
    )

    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    minimal_columns = {
        (col.table_name, col.column_name)
        for col in minimal_query.get_all_ast_referenced_columns()
    }
    if len(total_columns) <= len(minimal_columns):
        return None

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    legacy_references = set(minimal_query.get_all_referenced_columns())
    ast_column_names = {
        c.column_name for c in minimal_query.get_all_ast_referenced_columns()
    }
    # Ensures the legacy minimal query (which does not expand alias references)
    # does not contain alias references we removed when creating minimal_query.
    if legacy_references - ast_column_names:
        metrics.increment("columns.skip_invalid_legacy_query")
        return None

    result = runner(minimal_query, request_settings)
    del minimal_query

    if not result.result["data"]:
        return None

    # Make a copy so that, if this splitter ends up returning None (which
    # drives the execution strategy to ignore its result and try the next
    # splitter), the original query is left unmodified.
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond the ClickHouse maximum
        # query size, so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    query.add_conditions([(self.__id_column, "IN", event_ids)])
    query.add_condition_to_ast(
        in_condition(
            None,
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    # TODO: This is technically wrong. Event ids are unique per project, not
    # globally. So, if the minimal query returned the same event_id from two
    # different projects, we would be underestimating the limit here.
    query.set_limit(len(event_ids))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_condition(
        query,
        self.__project_column,
        "IN",
        project_ids,
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_condition(
        query,
        self.__timestamp_column,
        ">=",
        util.parse_datetime(min(timestamps)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_condition(
        query,
        self.__timestamp_column,
        "<",
        (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1),
        ),
    )

    return runner(query, request_settings)
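# Worked example of the date-range shrink above (illustrative, stdlib only;
# datetime.fromisoformat stands in for util.parse_datetime): the rewritten
# query is bounded by [min(timestamps), max(timestamps) + 1s), where the +1s
# compensates for the 1-second storage granularity of the timestamp column.
from datetime import datetime, timedelta

timestamps = ["2020-01-01T10:00:00", "2020-01-01T10:00:05"]
lower_bound = datetime.fromisoformat(min(timestamps))  # timestamp >= 10:00:00
upper_bound = datetime.fromisoformat(max(timestamps)) + timedelta(seconds=1)
assert upper_bound - lower_bound == timedelta(seconds=6)  # timestamp < 10:00:06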
def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    return FunctionCallPattern(
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_any_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of
                # parameters, so test this directly.
                return all(
                    isinstance(c, Literal) for c in exp.parameters[1].parameters
                )
    return False
def execute(
    self,
    query: Query,
    query_settings: QuerySettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split the query in two steps if a large number of columns is being
    selected:
    - the first query selects only event_id, project_id and timestamp;
    - the second query selects all fields for only those events and shrinks
      the date range accordingly.
    """
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby()
        or not query.get_selected_columns()
    ):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already an = or IN condition on an ID column.
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )

    for expr in query.get_condition() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs, not the number
    # of distinct Column objects in the query, so as to avoid counting
    # aliased columns multiple times.
    selected_columns = {
        (col.table_name, col.column_name)
        for col in query.get_columns_referenced_in_select()
    }
    if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
        metrics.increment("column_splitter.main_query_min_threshold")
        return None

    minimal_query = copy.deepcopy(query)
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns(
        [
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
            ),
        ]
    )

    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    # There is a ClickHouse bug where functions in the ORDER BY clause that
    # are not in the SELECT fail on distributed tables. For that specific
    # case, skip the query splitter.
    for orderby in minimal_query.get_orderby():
        if isinstance(
            orderby.expression, (FunctionCallExpr, CurriedFunctionCallExpr)
        ):
            metrics.increment("column_splitter.orderby_has_a_function")
            return None

    result = runner(minimal_query, query_settings)
    del minimal_query

    if not result.result["data"]:
        metrics.increment("column_splitter.no_data_from_minimal_query")
        return None

    # Make a copy so that, if this splitter ends up returning None (which
    # drives the execution strategy to ignore its result and try the next
    # splitter), the original query is left unmodified.
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond the ClickHouse maximum
        # query size, so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    query.add_condition_to_ast(
        in_condition(
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    query.set_limit(len(result.result["data"]))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1),
        ),
    )

    return runner(query, query_settings)
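# Illustrative check, not part of the original module: the splitter declines
# when the WHERE clause already pins the ID column. Assuming the matcher
# classes used above (FunctionCall, Column, String, Or, AnyExpression) and
# the plain AST nodes under their *Expr aliases, a pre-existing
# `event_id = 'abc'` condition matches id_column_matcher, so execute()
# returns None before attempting the split.
matcher = FunctionCall(
    Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
    (Column(None, String("event_id")), AnyExpression()),
)
pinned = FunctionCallExpr(
    None,
    ConditionFunctions.EQ,
    (ColumnExpr(None, None, "event_id"), LiteralExpr(None, "abc")),
)
assert matcher.match(pinned) is not None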
            date_string = result.expression("date_string")
            assert isinstance(date_string, Literal)  # mypy
            assert isinstance(date_string.value, str)  # mypy
            return Literal(exp.alias, parse_datetime(date_string.value))

        return exp

    query.transform_expressions(parse)


ARRAY_JOIN_MATCH = FunctionCallMatch(
    Param(
        "function_name", Or([StringMatch("arrayExists"), StringMatch("arrayAll")])
    ),
    (
        Param("column", ColumnMatch(AnyOptionalString(), AnyMatch(str))),
        Param(
            "op",
            Or([LiteralMatch(StringMatch(op)) for op in OPERATOR_TO_FUNCTION]),
        ),
        Param("value", AnyExpression()),
    ),
)


def _array_join_transformation(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
    def parse(exp: Expression) -> Expression:
        result = ARRAY_JOIN_MATCH.match(exp)
        if result:
            function_name = result.string("function_name")
            column = result.expression("column")
            assert isinstance(column, Column)
            op_literal = result.expression("op")
            assert isinstance(op_literal, Literal)
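# Illustrative match, not part of the original module: ARRAY_JOIN_MATCH
# recognizes the shorthand arrayExists/arrayAll form (column, operator
# literal, value) before the transformation expands it. It assumes "=" is a
# key of OPERATOR_TO_FUNCTION and that Column, Literal and FunctionCall are
# the plain AST constructors used above.
shorthand = FunctionCall(
    None,
    "arrayExists",
    (
        Column(None, None, "tags.key"),
        Literal(None, "="),
        Literal(None, "foo"),
    ),
)
assert ARRAY_JOIN_MATCH.match(shorthand) is not None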
def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    return FunctionCallPattern(
        None,
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(None, String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of
                # parameters, so test this directly.
                return all(
                    isinstance(c, Literal) for c in exp.parameters[1].parameters
                )
    return False