def _list_array_join(query: Query) -> Columnset:
    """
    Collect the columns referenced by array joins in the query.

    Two sources are inspected: the array join clause of the query (when
    one is set) and every `arrayJoin` function call found among the
    query expressions.
    """
    columns = set()

    clause = query.get_arrayjoin_from_ast()
    if clause is not None:
        columns |= _get_columns_from_expression(clause)

    array_join_calls = (
        exp
        for exp in query.get_all_expressions()
        if isinstance(exp, FunctionCallExpr) and exp.function_name == "arrayJoin"
    )
    for call in array_join_calls:
        columns |= _get_columns_from_expression(call)

    return columns
def __get_array_joins_in_query(self, query: Query) -> Set[str]:
    """
    Return the names captured as "col" by the arrayJoin pattern for
    every expression of the query that matches it.
    """
    matches = (
        self.__array_join_pattern.match(exp) for exp in query.get_all_expressions()
    )
    return {found.string("col") for found in matches if found is not None}
def test_iterate_over_query() -> None:
    """
    Builds a query with the new AST and checks the sequence of
    expressions produced when iterating over the whole query.
    """
    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2,))

    where_clause = binary_condition(ConditionFunctions.EQ, col_c1, Literal(None, "1"))
    prewhere_clause = binary_condition(
        ConditionFunctions.EQ, col_c2, Literal(None, "2")
    )
    ordering = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=where_clause,
        groupby=[f1_call],
        prewhere=prewhere_clause,
        having=None,
        order_by=[ordering],
    )

    # Expected expressions, one segment per clause, in iteration order.
    from_selected_columns = [col_c1, col_c2, f1_call]
    from_condition = [col_c1, Literal(None, "1"), where_clause]
    from_groupby = [col_c1, col_c2, f1_call]
    from_order_by = [col_c2, f2_call]
    from_prewhere = [col_c2, Literal(None, "2"), prewhere_clause]

    expected_expressions = [
        *from_selected_columns,
        *from_condition,
        *from_groupby,
        *from_order_by,
        *from_prewhere,
    ]

    assert list(query.get_all_expressions()) == expected_expressions
def __get_unused_alias(self, query: Query) -> str:
    """
    Pick an alias for the arrayJoin optimization that no expression in
    the query already uses.

    The first candidate is `snuba_all_<column_name>`; on collision a
    numeric suffix (`_1`, `_2`, ...) is appended until the name is free.
    """
    taken = {expression.alias for expression in query.get_all_expressions()}
    root = f"snuba_all_{self.column_name}"

    candidate = root
    next_suffix = 1
    while candidate in taken:
        candidate = f"{root}_{next_suffix}"
        next_suffix += 1
    return candidate
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrite the arrayJoin expressions on the key/value columns of the
    configured mapping column.

    Every expression matching `arrayJoin(<key column>)` or
    `arrayJoin(<value column>)` is handed to one of the mapping helpers
    depending on which of the two arrayJoins appear in the query and on
    whether filtered keys were found. All other expressions are left
    untouched.
    """
    key_col = key_column(self.__column_name)
    val_col = val_column(self.__column_name)

    arrayjoin_pattern = FunctionCall(
        String("arrayJoin"),
        (Column(column_name=Param("col", Or([String(key_col), String(val_col)]))),),
    )

    # Names of the columns that are actually array-joined in this query.
    pattern_matches = (
        arrayjoin_pattern.match(exp) for exp in query.get_all_expressions()
    )
    arrayjoins_in_query = {
        found.string("col") for found in pattern_matches if found is not None
    }

    filtered_keys = [
        LiteralExpr(None, key)
        for key in get_filtered_mapping_keys(query, self.__column_name)
    ]

    # Ensures the alias we apply to the arrayJoin is not already taken.
    used_aliases = {exp.alias for exp in query.get_all_expressions()}
    pair_alias_root = f"snuba_all_{self.__column_name}"
    pair_alias = pair_alias_root
    next_suffix = 1
    while pair_alias in used_aliases:
        pair_alias = f"{pair_alias_root}_{next_suffix}"
        next_suffix += 1

    # True when both arrayJoin(col.key) and arrayJoin(col.value) are
    # present in the query; this does not depend on the single expression
    # being replaced.
    both_sides_joined = arrayjoins_in_query == {key_col, val_col}

    def replace_expression(expr: Expression) -> Expression:
        """
        Applies the appropriate optimization on a single arrayJoin
        expression.
        """
        found = arrayjoin_pattern.match(expr)
        if found is None:
            return expr

        if not both_sides_joined:
            if filtered_keys:
                # Only one between arrayJoin(col.key) and
                # arrayJoin(col.value) is present, and it is
                # arrayJoin(col.key) since we found filtered keys.
                return _filtered_mapping_keys(
                    expr.alias, self.__column_name, filtered_keys
                )
            # No viable optimization
            return expr

        # Both sides are present: do the arrayJoin on key-value pairs
        # instead of independent arrayJoins for keys and values. The index
        # selects the key (1) or the value (2) out of each pair.
        if found.string("col") == key_col:
            array_index = LiteralExpr(None, 1)
        else:
            array_index = LiteralExpr(None, 2)

        if filtered_keys:
            return _filtered_mapping_pairs(
                expr.alias,
                self.__column_name,
                pair_alias,
                filtered_keys,
                array_index,
            )
        return _unfiltered_mapping_pairs(
            expr.alias, self.__column_name, pair_alias, array_index
        )

    query.transform_expressions(replace_expression)
def test_replace_expression() -> None:
    """
    Builds a query with the new AST, replaces every f1(...) call with
    tag('f1') and checks that the replacement reached every clause.
    """

    def tagged() -> FunctionCall:
        # The node every f1(...) call is expected to be turned into.
        return FunctionCall("alias", "tag", (Literal(None, "f1"),))

    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2,))
    ordering = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, f1_call, Literal(None, "1")
        ),
        groupby=[f1_call],
        having=None,
        prewhere=binary_condition(
            ConditionFunctions.EQ, f1_call, Literal(None, "2")
        ),
        order_by=[ordering],
    )

    def replace(exp: Expression) -> Expression:
        is_f1 = isinstance(exp, FunctionCall) and exp.function_name == "f1"
        if not is_f1:
            return exp
        return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))

    query.transform_expressions(replace)

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", tagged())],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, tagged(), Literal(None, "1")
        ),
        groupby=[tagged()],
        prewhere=binary_condition(
            ConditionFunctions.EQ, tagged(), Literal(None, "2")
        ),
        having=None,
        order_by=[ordering],
    )

    assert query.get_selected_columns() == expected_query.get_selected_columns()
    assert query.get_condition() == expected_query.get_condition()
    assert query.get_groupby() == expected_query.get_groupby()
    assert query.get_having() == expected_query.get_having()
    assert query.get_orderby() == expected_query.get_orderby()

    actual_expressions = list(query.get_all_expressions())
    expected_expressions = list(expected_query.get_all_expressions())
    assert actual_expressions == expected_expressions
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """
    Move the highest priority first-level AND conditions of the query
    from the condition clause to the prewhere clause.

    A condition is eligible when it is a function call whose function is
    in ALLOWED_OPERATORS and it references at least one of the prewhere
    candidate columns. Priority follows the position of the column in the
    candidate list, and at most `max_prewhere_conditions` conditions are
    moved.
    """
    max_prewhere_conditions: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )
    prewhere_keys = self.__prewhere_candidates

    # We remove the candidates that appear in a uniq or -If aggregation
    # because a query like `countIf(col=x) .. PREWHERE col=x` can make
    # the Clickhouse server crash.
    uniq_cols: Set[str] = set()
    for exp in query.get_all_expressions():
        if not isinstance(exp, FunctionCall):
            continue
        if exp.function_name == "uniq" or exp.function_name.endswith("If"):
            uniq_cols.update(
                column.column_name for column in get_columns_in_expression(exp)
            )

    for col in uniq_cols:
        if col not in prewhere_keys:
            continue
        metrics.increment(
            "uniq_col_in_prewhere_candidate",
            tags={"column": col, "referrer": query_settings.referrer},
        )

    prewhere_keys = [key for key in prewhere_keys if key not in uniq_cols]

    # In case the query is final we cannot simply add any candidate
    # condition to the prewhere.
    # Final is applied after prewhere, so there are cases where moving
    # conditions to the prewhere could exclude from the result set rows
    # that would be merged under the `final` condition.
    # Example: rewriting the group_id on an unmerge. If the group_id is
    # in the prewhere, final will fail at merging the rows.
    # HACK: If the query has final, do not move any condition on a column
    # in the omit_if_final list to prewhere.
    # There is a bug in ClickHouse affecting queries with FINAL and
    # PREWHERE with Low Cardinality and Nullable columns.
    # https://github.com/ClickHouse/ClickHouse/issues/16171
    if query.get_from_clause().final and self.__omit_if_final:
        prewhere_keys = [
            key for key in prewhere_keys if key not in self.__omit_if_final
        ]

    if not prewhere_keys:
        return

    ast_condition = query.get_condition()
    if ast_condition is None:
        return

    # Pair every eligible first-level condition with the columns it uses.
    prewhere_candidates = []
    for cond in get_first_level_and_conditions(ast_condition):
        if not isinstance(cond, FunctionCall):
            continue
        if cond.function_name not in ALLOWED_OPERATORS:
            continue
        if any(
            col.column_name in prewhere_keys
            for col in get_columns_in_expression(cond)
        ):
            prewhere_candidates.append((get_columns_in_expression(cond), cond))

    if not prewhere_candidates:
        return

    def best_key_position(candidate) -> int:
        # Lowest index, in the prewhere keys list, among the candidate's
        # columns that are prewhere keys.
        cols, _ = candidate
        return min(
            prewhere_keys.index(col.column_name)
            for col in cols
            if col.column_name in prewhere_keys
        )

    # Use the conditions that have the highest priority (based on the
    # position of their columns in the prewhere keys list). The sort is
    # stable, so ties keep their original relative order.
    ranked_candidates = sorted(prewhere_candidates, key=best_key_position)
    prewhere_conditions = [cond for _, cond in ranked_candidates][
        :max_prewhere_conditions
    ]

    new_conditions = [
        cond
        for cond in get_first_level_and_conditions(ast_condition)
        if cond not in prewhere_conditions
    ]

    if new_conditions:
        query.set_ast_condition(combine_and_conditions(new_conditions))
    else:
        query.set_ast_condition(None)

    if prewhere_conditions:
        query.set_prewhere_ast_condition(combine_and_conditions(prewhere_conditions))
    else:
        query.set_prewhere_ast_condition(None)