Example #1
    def __init__(self, query: Query, settings: RequestSettings) -> None:
        # Clickhouse query structure.
        # We reference the clauses here directly since it makes it easier to
        # process this query independently from the Clickhouse Query object,
        # and there is no risk in doing so since they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_data_source()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        if self.__having:
            assert self.__groupby, "found HAVING clause with no GROUP BY"

        self.__turbo = settings.get_turbo()
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        self.__prewhere = query.get_prewhere_ast()

        self.__settings = settings
        self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
        self.__formatted_query: Optional[str] = None
        self.__sql_data: Optional[Mapping[str, str]] = None
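
The three Optional fields at the end are initialized to None, which suggests
the formatted SQL is computed lazily and cached on first use. A minimal sketch
of that compute-once pattern, with LazySqlCache and format_sql as hypothetical
names (the real method names are not shown in this snippet):

from typing import Optional


class LazySqlCache:
    # Hypothetical illustration of a compute-once cache for formatted SQL.
    def __init__(self, raw_query: str) -> None:
        self.__raw_query = raw_query
        self.__formatted_query: Optional[str] = None

    def format_sql(self) -> str:
        # Format only on first access and reuse the cached string afterwards.
        if self.__formatted_query is None:
            self.__formatted_query = self.__raw_query.strip().rstrip(";")
        return self.__formatted_query


assert LazySqlCache("SELECT 1; ").format_sql() == "SELECT 1"
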
Example #2
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization
    on.
    Which means: if the arrayJoin is in the select clause, there
    are one or more top level AND condition on the arrayJoin and
    there is no OR condition in the query.
    """
    array_join_found = any(
        array_join_pattern(column_name).match(f) is not None
        for selected in query.get_selected_columns_from_ast() or []
        for f in selected.expression)

    if not array_join_found:
        return set()

    ast_condition = query.get_condition_from_ast()
    cond_keys: Optional[Set[str]] = (
        _get_mapping_keys_in_condition(ast_condition, column_name)
        if ast_condition is not None else set()
    )
    if cond_keys is None:
        # This means we found an OR. We give up, even though there could be
        # cases where this condition is still optimizable.
        return set()

    ast_having = query.get_having_from_ast()
    having_keys: Optional[Set[str]] = (
        _get_mapping_keys_in_condition(ast_having, column_name)
        if ast_having is not None else set()
    )
    if having_keys is None:
        # Same as above
        return set()

    return cond_keys | having_keys
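
In the function above, _get_mapping_keys_in_condition distinguishes None from
the empty set: None means an OR was found and the optimization must be
abandoned, while an empty set simply means no relevant condition exists. A
minimal standalone sketch of how the two results combine under that convention
(combine_mapping_keys is a hypothetical name, not part of the codebase):

from typing import Optional, Set


def combine_mapping_keys(
    cond_keys: Optional[Set[str]], having_keys: Optional[Set[str]]
) -> Set[str]:
    # None from either clause means an OR was found: give up on the optimization.
    if cond_keys is None or having_keys is None:
        return set()
    # Otherwise the optimizable keys are the union of both clauses.
    return cond_keys | having_keys


assert combine_mapping_keys({"k1"}, {"k2"}) == {"k1", "k2"}
assert combine_mapping_keys(None, {"k2"}) == set()
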
Example #3
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        if not get_config(self.__killswitch, 1):
            return

        cond_class = ConditionClass.IRRELEVANT
        condition = query.get_condition_from_ast()
        if condition is not None:
            cond_class = self.__classify_combined_conditions(condition)
            if cond_class == ConditionClass.NOT_OPTIMIZABLE:
                return

        having_cond_class = ConditionClass.IRRELEVANT
        having_cond = query.get_having_from_ast()
        if having_cond is not None:
            having_cond_class = self.__classify_combined_conditions(
                having_cond)
            if having_cond_class == ConditionClass.NOT_OPTIMIZABLE:
                return

        if not (cond_class == ConditionClass.OPTIMIZABLE
                or having_cond_class == ConditionClass.OPTIMIZABLE):
            return

        metrics.increment("optimizable_query")

        if condition is not None:
            query.set_ast_condition(
                condition.transform(self.__replace_with_hash))
        if having_cond is not None:
            query.set_ast_having(
                having_cond.transform(self.__replace_with_hash))
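
The early returns in process_query above boil down to one decision over the
two classifications: bail out if either clause is NOT_OPTIMIZABLE, and rewrite
only if at least one clause is OPTIMIZABLE. A minimal sketch of that rule,
where should_rewrite is a hypothetical helper and the enum values are assumed
for illustration:

from enum import Enum


class ConditionClass(Enum):
    IRRELEVANT = 0
    OPTIMIZABLE = 1
    NOT_OPTIMIZABLE = 2


def should_rewrite(where_class: ConditionClass,
                   having_class: ConditionClass) -> bool:
    # Bail out if either clause contains something we cannot rewrite safely.
    if ConditionClass.NOT_OPTIMIZABLE in (where_class, having_class):
        return False
    # Rewrite only when at least one clause actually references the mapping.
    return ConditionClass.OPTIMIZABLE in (where_class, having_class)


assert should_rewrite(ConditionClass.OPTIMIZABLE, ConditionClass.IRRELEVANT)
assert not should_rewrite(ConditionClass.IRRELEVANT, ConditionClass.IRRELEVANT)
assert not should_rewrite(ConditionClass.OPTIMIZABLE,
                          ConditionClass.NOT_OPTIMIZABLE)
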
Example #4
def test_tags_processor(query_body: MutableMapping[str, Any],
                        expected_query: ClickhouseQuery) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    assert (processed.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (processed.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (processed.get_having_from_ast() ==
            expected_query.get_having_from_ast())
Example #5
def test_translation(mappers: TranslationMappers, query: SnubaQuery,
                     expected: ClickhouseQuery) -> None:
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    assert (expected.get_selected_columns_from_ast() ==
            translated.get_selected_columns_from_ast())
    assert expected.get_groupby_from_ast() == translated.get_groupby_from_ast()
    assert (expected.get_condition_from_ast() ==
            translated.get_condition_from_ast())
    assert (expected.get_arrayjoin_from_ast() ==
            translated.get_arrayjoin_from_ast())
    assert expected.get_having_from_ast() == translated.get_having_from_ast()
    assert expected.get_orderby_from_ast() == translated.get_orderby_from_ast()
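
The TODO above suggests giving Query an __eq__ or turning it into a dataclass,
which would collapse these clause-by-clause asserts into a single comparison.
A minimal sketch of that idea on a stripped-down, hypothetical stand-in for
the real Query class:

from dataclasses import dataclass
from typing import Optional, Sequence


@dataclass(frozen=True)
class MiniQuery:
    # Hypothetical, heavily reduced stand-in for the real Query class.
    selected_columns: Sequence[str]
    condition: Optional[str]
    groupby: Sequence[str]
    # dataclass generates __eq__ from the fields, so a test could simply do:
    #     assert translated == expected


assert MiniQuery(["c1"], "c1 = 1", []) == MiniQuery(["c1"], "c1 = 1", [])
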
Example #6
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        conditions = query.get_conditions()
        if not conditions:
            return

        # Enable the processor only if we have enough data in the flattened
        # columns, which have been deployed at BEGINNING_OF_TIME. If the query
        # starts earlier than that we do not apply the optimization.
        if self.__beginning_of_time:
            apply_optimization = False
            for condition in conditions:
                if (is_condition(condition) and isinstance(condition[0], str)
                        and condition[0] in self.__timestamp_cols
                        and condition[1] in (">=", ">")
                        and isinstance(condition[2], str)):
                    try:
                        start_ts = parse_datetime(condition[2])
                        if (start_ts -
                                self.__beginning_of_time).total_seconds() > 0:
                            apply_optimization = True
                    except Exception:
                        # We should not get here: it means the from timestamp
                        # is malformed. Returning here is just for safety.
                        logger.error(
                            "Cannot parse start date for NestedFieldOptimizer: %r",
                            condition,
                        )
                        return
            if not apply_optimization:
                return

        # Do not use flattened tags if tags are being unpacked anyway. In that
        # case using flattened tags only implies loading an additional column,
        # thus making the query heavier and slower.
        if self.__has_tags(query.get_arrayjoin_from_ast()):
            return
        if query.get_groupby_from_ast():
            for expression in query.get_groupby_from_ast():
                if self.__has_tags(expression):
                    return
        if self.__has_tags(query.get_having_from_ast()):
            return

        if query.get_orderby_from_ast():
            for orderby in query.get_orderby_from_ast():
                if self.__has_tags(orderby.expression):
                    return

        new_conditions = []
        positive_like_expression: List[str] = []
        negative_like_expression: List[str] = []

        for c in conditions:
            keyvalue = self.__is_optimizable(c, self.__nested_col)
            if not keyvalue:
                new_conditions.append(c)
            else:
                expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
                if keyvalue.operand == Operand.EQ:
                    positive_like_expression.append(expression)
                else:
                    negative_like_expression.append(expression)

        if positive_like_expression:
            # Positive conditions "=" are all merged together in one LIKE expression
            positive_like_expression = sorted(positive_like_expression)
            like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
            new_conditions.append(
                [self.__flattened_col, "LIKE", like_formatted])

        for expression in negative_like_expression:
            # Negative conditions "!=" cannot be merged together. We can still transform
            # them into NOT LIKE statements, but each condition has to be one
            # statement.
            not_like_formatted = f"%|{expression}|%"
            new_conditions.append(
                [self.__flattened_col, "NOT LIKE", not_like_formatted])

        query.set_conditions(new_conditions)
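
To make the merging rule above concrete: every "=" condition is collapsed into
a single LIKE pattern over the flattened column, while each "!=" condition
becomes its own NOT LIKE. A small standalone sketch of that formatting step
(build_like_conditions and the "tags_flattened" column name are illustrative,
not the real API):

from typing import List, Sequence, Tuple


def build_like_conditions(
    flattened_col: str,
    positive: Sequence[str],
    negative: Sequence[str],
) -> List[Tuple[str, str, str]]:
    conditions: List[Tuple[str, str, str]] = []
    if positive:
        # Positive "=" conditions are merged into one LIKE expression.
        merged = f"%|{'|%|'.join(sorted(positive))}|%"
        conditions.append((flattened_col, "LIKE", merged))
    for expression in negative:
        # Negative "!=" conditions each become their own NOT LIKE statement.
        conditions.append((flattened_col, "NOT LIKE", f"%|{expression}|%"))
    return conditions


assert build_like_conditions(
    "tags_flattened", ["k2=v2", "k1=v1"], ["k3=v3"]
) == [
    ("tags_flattened", "LIKE", "%|k1=v1|%|k2=v2|%"),
    ("tags_flattened", "NOT LIKE", "%|k3=v3|%"),
]
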
Example #7
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different function
    replaces f1(...) with tag(f1)
    """
    column1 = Column(None, "t1", "c1")
    column2 = Column(None, "t1", "c2")
    function_1 = FunctionCall("alias", "f1", (column1, column2))
    function_2 = FunctionCall("alias", "f2", (column2, ))

    condition = binary_condition(ConditionFunctions.EQ, function_1,
                                 Literal(None, "1"))

    prewhere = binary_condition(ConditionFunctions.EQ, function_1,
                                Literal(None, "2"))

    orderby = OrderBy(OrderByDirection.ASC, function_2)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", function_1)],
        array_join=None,
        condition=condition,
        groupby=[function_1],
        having=None,
        prewhere=prewhere,
        order_by=[orderby],
    )

    def replace(exp: Expression) -> Expression:
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"), ))
        return exp

    query.transform_expressions(replace)

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "alias", FunctionCall("alias", "tag", (Literal(None, "f1"), )))
        ],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "1"),
        ),
        groupby=[FunctionCall("alias", "tag", (Literal(None, "f1"), ))],
        prewhere=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "2"),
        ),
        having=None,
        order_by=[orderby],
    )

    assert (query.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (query.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (query.get_groupby_from_ast() ==
            expected_query.get_groupby_from_ast())
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert (query.get_orderby_from_ast() ==
            expected_query.get_orderby_from_ast())

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions())