Ejemplo n.º 1
0
def _list_array_join(query: Query) -> Columnset:
    """
    Gather what ``_get_columns_from_expression`` reports for the array
    joins of the given query.

    Two sources are inspected: the array join clause returned by
    ``query.get_arrayjoin_from_ast()`` (when there is one) and every
    ``arrayJoin`` function call found among the query expressions.
    """
    found = set()

    array_join_clause = query.get_arrayjoin_from_ast()
    if array_join_clause is not None:
        found |= _get_columns_from_expression(array_join_clause)

    # Lazily pick the arrayJoin(...) calls out of all the expressions.
    array_join_calls = (
        expression for expression in query.get_all_expressions()
        if isinstance(expression, FunctionCallExpr)
        and expression.function_name == "arrayJoin"
    )
    for call in array_join_calls:
        found |= _get_columns_from_expression(call)

    return found
Ejemplo n.º 2
0
    def __get_array_joins_in_query(self, query: Query) -> Set[str]:
        """
        Return the "col" value captured from every expression of the query
        that matches the arrayJoin pattern held by this instance.
        """
        attempts = (
            self.__array_join_pattern.match(expression)
            for expression in query.get_all_expressions()
        )
        # Expressions that do not match the pattern yield None: skip them.
        return {
            found.string("col") for found in attempts if found is not None
        }
Ejemplo n.º 3
0
def test_iterate_over_query() -> None:
    """
    Builds a query on the new AST and checks the sequence of expressions
    produced by get_all_expressions().
    """
    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2, ))

    where_clause = binary_condition(
        ConditionFunctions.EQ, col_c1, Literal(None, "1"))
    prewhere_clause = binary_condition(
        ConditionFunctions.EQ, col_c2, Literal(None, "2"))
    ordering = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=where_clause,
        groupby=[f1_call],
        prewhere=prewhere_clause,
        having=None,
        order_by=[ordering],
    )

    # Expressions expected from each clause, listed in the order in which
    # the clauses are expected to be visited.
    from_selected_columns = [col_c1, col_c2, f1_call]
    from_condition = [col_c1, Literal(None, "1"), where_clause]
    from_groupby = [col_c1, col_c2, f1_call]
    from_order_by = [col_c2, f2_call]
    from_prewhere = [col_c2, Literal(None, "2"), prewhere_clause]

    expected_expressions = (
        from_selected_columns
        + from_condition
        + from_groupby
        + from_order_by
        + from_prewhere
    )

    assert list(query.get_all_expressions()) == expected_expressions
Ejemplo n.º 4
0
    def __get_unused_alias(self, query: Query) -> str:
        """
        Pick an alias for the arrayJoin optimization that no expression in
        the query is already using.

        The first choice is "snuba_all_<column_name>"; if that is taken, a
        numeric suffix ("_1", "_2", ...) is appended until a free name is
        found.
        """
        taken = set()
        for expression in query.get_all_expressions():
            taken.add(expression.alias)

        base = f"snuba_all_{self.column_name}"
        if base not in taken:
            return base

        suffix = 1
        while f"{base}_{suffix}" in taken:
            suffix += 1
        return f"{base}_{suffix}"
Ejemplo n.º 5
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Rewrite the arrayJoin calls applied to the key and value columns of
        the mapping column handled by this processor.

        - If the query contains arrayJoin on both the key and the value
          column, each call is replaced by the result of
          `_unfiltered_mapping_pairs` or, when filtered keys were found, of
          `_filtered_mapping_pairs`.
        - If only one of the two is present and filtered keys were found,
          the call is replaced by the result of `_filtered_mapping_keys`.
        - Otherwise the expression is left untouched.
        """
        column_name = self.__column_name

        # Matches arrayJoin(<col>.key) and arrayJoin(<col>.value), capturing
        # the column name as "col".
        arrayjoin_pattern = FunctionCall(
            String("arrayJoin"),
            (Column(column_name=Param(
                "col",
                Or([
                    String(key_column(column_name)),
                    String(val_column(column_name)),
                ]),
            ), ), ),
        )

        attempts = (
            arrayjoin_pattern.match(expression)
            for expression in query.get_all_expressions()
        )
        arrayjoins_in_query = {
            found.string("col") for found in attempts if found is not None
        }

        filtered_keys = []
        for key in get_filtered_mapping_keys(query, column_name):
            filtered_keys.append(LiteralExpr(None, key))

        # The alias given to the pairs expression must not clash with any
        # alias already present in the query.
        taken_aliases = set()
        for expression in query.get_all_expressions():
            taken_aliases.add(expression.alias)

        pair_alias_root = f"snuba_all_{column_name}"
        pair_alias = pair_alias_root
        suffix = 1
        while pair_alias in taken_aliases:
            pair_alias = f"{pair_alias_root}_{suffix}"
            suffix += 1

        def replace_expression(expr: Expression) -> Expression:
            """
            Return the optimized replacement for a single arrayJoin
            expression, or the expression itself when nothing applies.
            """
            match = arrayjoin_pattern.match(expr)
            if match is None:
                return expr

            keys_and_values_joined = arrayjoins_in_query == {
                key_column(column_name),
                val_column(column_name),
            }
            if not keys_and_values_joined:
                # Only one between arrayJoin(col.key) and
                # arrayJoin(col.value) is present. If filtered keys were
                # found it is arrayJoin(col.key); otherwise there is no
                # viable optimization.
                if filtered_keys:
                    return _filtered_mapping_keys(expr.alias, column_name,
                                                  filtered_keys)
                return expr

            # Both arrayJoin(col.key) and arrayJoin(col.value) are present
            # in the query: do the arrayJoin on key-value pairs instead of
            # two independent arrayJoins. The key is element 1 of the pair,
            # the value is element 2.
            if match.string("col") == key_column(column_name):
                array_index = LiteralExpr(None, 1)
            else:
                array_index = LiteralExpr(None, 2)

            if filtered_keys:
                return _filtered_mapping_pairs(
                    expr.alias,
                    column_name,
                    pair_alias,
                    filtered_keys,
                    array_index,
                )
            return _unfiltered_mapping_pairs(expr.alias, column_name,
                                             pair_alias, array_index)

        query.transform_expressions(replace_expression)
Ejemplo n.º 6
0
def test_replace_expression() -> None:
    """
    Builds a query on the new AST and uses transform_expressions to swap
    every f1(...) call for tag("f1"), keeping the alias of the replaced
    call. The result is compared clause by clause with a hand built query.
    """
    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2,))

    where_clause = binary_condition(
        ConditionFunctions.EQ, f1_call, Literal(None, "1")
    )
    prewhere_clause = binary_condition(
        ConditionFunctions.EQ, f1_call, Literal(None, "2")
    )
    ordering = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=where_clause,
        groupby=[f1_call],
        having=None,
        prewhere=prewhere_clause,
        order_by=[ordering],
    )

    def replace(exp: Expression) -> Expression:
        is_f1_call = isinstance(exp, FunctionCall) and exp.function_name == "f1"
        if not is_f1_call:
            return exp
        return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))

    query.transform_expressions(replace)

    def tag_call() -> FunctionCall:
        # A fresh copy of the expression every f1(...) call should become.
        return FunctionCall("alias", "tag", (Literal(None, "f1"),))

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", tag_call())],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, tag_call(), Literal(None, "1"),
        ),
        groupby=[tag_call()],
        prewhere=binary_condition(
            ConditionFunctions.EQ, tag_call(), Literal(None, "2"),
        ),
        having=None,
        order_by=[ordering],
    )

    # Clause by clause comparison first, so a failure points at the clause.
    assert query.get_selected_columns() == expected_query.get_selected_columns()
    assert query.get_condition() == expected_query.get_condition()
    assert query.get_groupby() == expected_query.get_groupby()
    assert query.get_having() == expected_query.get_having()
    assert query.get_orderby() == expected_query.get_orderby()

    # Then the full sequence of expressions of both queries.
    actual_expressions = list(query.get_all_expressions())
    wanted_expressions = list(expected_query.get_all_expressions())
    assert actual_expressions == wanted_expressions
Ejemplo n.º 7
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Promote some of the first level AND conditions of the query to the
        prewhere condition.

        A condition qualifies when it is a FunctionCall whose function name
        is in ALLOWED_OPERATORS and it references at least one prewhere
        candidate column that survived the exclusions applied below.
        Qualifying conditions are ranked by the position of their best
        candidate column in the candidates list and at most
        max_prewhere_conditions of them are moved. The query is not
        modified when there is nothing to move.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        candidate_keys = self.__prewhere_candidates

        # Columns appearing in a uniq or -If aggregation are not eligible,
        # because a query like `countIf(col=x) .. PREWHERE col=x` can make
        # the Clickhouse server crash.
        aggregated_cols: Set[str] = set()
        for expression in query.get_all_expressions():
            if not isinstance(expression, FunctionCall):
                continue
            if (expression.function_name == "uniq"
                    or expression.function_name.endswith("If")):
                for column in get_columns_in_expression(expression):
                    aggregated_cols.add(column.column_name)

        # Emit a metric for every candidate dropped for that reason.
        for column_name in aggregated_cols:
            if column_name not in candidate_keys:
                continue
            metrics.increment(
                "uniq_col_in_prewhere_candidate",
                tags={
                    "column": column_name,
                    "referrer": query_settings.referrer
                },
            )

        candidate_keys = [
            key for key in candidate_keys if key not in aggregated_cols
        ]

        # FINAL is applied after PREWHERE, so moving a condition to the
        # prewhere can exclude rows that would otherwise be merged under
        # FINAL (for example when the group_id is rewritten on an unmerge:
        # with group_id in the prewhere, FINAL fails to merge the rows).
        # HACK: when the query has FINAL, never move a condition on a
        # column listed in omit_if_final.
        # There is also a ClickHouse bug affecting queries with FINAL and
        # PREWHERE on Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            candidate_keys = [
                key for key in candidate_keys
                if key not in self.__omit_if_final
            ]

        if not candidate_keys:
            return

        ast_condition = query.get_condition()
        if ast_condition is None:
            return

        # (columns referenced by the condition, condition) for every
        # qualifying first level condition.
        eligible = []
        for cond in get_first_level_and_conditions(ast_condition):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            references_candidate = any(
                col.column_name in candidate_keys
                for col in get_columns_in_expression(cond))
            if references_candidate:
                eligible.append((get_columns_in_expression(cond), cond))
        if not eligible:
            return

        # Highest priority first: a condition is ranked by the lowest
        # position, in the candidates list, of any column it references.
        # The sort is stable, so ties keep their original order.
        ranked = sorted(
            eligible,
            key=lambda entry: min(
                candidate_keys.index(col.column_name) for col in entry[0]
                if col.column_name in candidate_keys),
        )
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        remaining_conditions = [
            cond for cond in get_first_level_and_conditions(ast_condition)
            if cond not in prewhere_conditions
        ]

        where_clause = (combine_and_conditions(remaining_conditions)
                        if remaining_conditions else None)
        query.set_ast_condition(where_clause)

        prewhere_clause = (combine_and_conditions(prewhere_conditions)
                           if prewhere_conditions else None)
        query.set_prewhere_ast_condition(prewhere_clause)