Example #1
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Promote top level AND conditions from the WHERE clause to PREWHERE.

        A condition is eligible when it is a FunctionCall whose function is
        in ALLOWED_OPERATORS and it references at least one of the prewhere
        candidate columns. Eligible conditions are ranked by the position of
        their best column in the candidate list and the first
        ``max_prewhere_conditions`` of them are moved. The query is left
        untouched when there are no candidate columns, no condition, or no
        eligible condition. ``request_settings`` is not used here.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        candidate_keys = self.__prewhere_candidates

        # HACK: If query has final, do not move any condition on a column in the
        # omit_if_final list to prewhere.
        # There is a bug in ClickHouse affecting queries with FINAL and PREWHERE
        # with Low Cardinality and Nullable columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            candidate_keys = [
                key for key in candidate_keys
                if key not in self.__omit_if_final
            ]

        if not candidate_keys:
            return

        where_clause = query.get_condition_from_ast()
        if where_clause is None:
            return

        eligible = []
        for cond in get_first_level_and_conditions(where_clause):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if any(col.column_name in candidate_keys
                   for col in get_columns_in_expression(cond)):
                eligible.append((get_columns_in_expression(cond), cond))

        if not eligible:
            return

        def priority(cols):
            # Lowest index wins: the earlier a column appears in the
            # candidate list, the higher the priority of its condition.
            return min(
                candidate_keys.index(col.column_name) for col in cols
                if col.column_name in candidate_keys)

        # sorted() is stable, so ties keep their original relative order.
        ranked = sorted(
            [(priority(cols), cond) for cols, cond in eligible],
            key=lambda entry: entry[0],
        )
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        remaining = []
        for cond in get_first_level_and_conditions(where_clause):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        where_ast = combine_and_conditions(remaining) if remaining else None
        query.set_ast_condition(where_ast)

        prewhere_ast = (combine_and_conditions(prewhere_conditions)
                        if prewhere_conditions else None)
        query.set_prewhere_ast_condition(prewhere_ast)
Example #2
0
    def verify_expressions(top_level: Expression,
                           expected: Expression) -> bool:
        """
        Return True when every first level AND condition of ``expected`` is
        also one of the first level AND conditions of ``top_level``.
        """
        available = get_first_level_and_conditions(top_level)
        return all(
            wanted in available
            for wanted in get_first_level_and_conditions(expected))
Example #3
0
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        """
        Promote top level AND conditions from the WHERE clause to PREWHERE.

        The prewhere candidate columns are read from the query FROM clause.
        A condition is eligible when it is a FunctionCall whose function is
        in ALLOWED_OPERATORS and it references at least one candidate column.
        Eligible conditions are ranked by the position of their best column
        in the candidate list and the first ``max_prewhere_conditions`` of
        them are moved. Nothing changes when there are no candidates, no
        condition, or no eligible condition. ``request_settings`` is unused.
        """
        limit: int = (
            self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
        )
        candidate_keys = query.get_from_clause().prewhere_candidates
        if not candidate_keys:
            return

        where_clause = query.get_condition_from_ast()
        if where_clause is None:
            return

        eligible = []
        for cond in get_first_level_and_conditions(where_clause):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            if any(
                col.column_name in candidate_keys
                for col in get_columns_in_expression(cond)
            ):
                eligible.append((get_columns_in_expression(cond), cond))

        if not eligible:
            return

        def priority(cols):
            # Lowest index wins: the earlier a column appears in the
            # candidate list, the higher the priority of its condition.
            return min(
                candidate_keys.index(col.column_name)
                for col in cols
                if col.column_name in candidate_keys
            )

        # sorted() is stable, so ties keep their original relative order.
        ranked = sorted(
            [(priority(cols), cond) for cols, cond in eligible],
            key=lambda entry: entry[0],
        )
        prewhere_conditions = [cond for _, cond in ranked[:limit]]

        remaining = []
        for cond in get_first_level_and_conditions(where_clause):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        where_ast = combine_and_conditions(remaining) if remaining else None
        query.set_ast_condition(where_ast)

        prewhere_ast = (
            combine_and_conditions(prewhere_conditions)
            if prewhere_conditions
            else None
        )
        query.set_prewhere_ast_condition(prewhere_ast)
Example #4
0
def test_first_level_conditions() -> None:
    """
    Checks that nested AND / OR conditions are flattened into their first
    level operands, including when the boolean function is wrapped in an
    ``equals(..., 1)`` call.
    """

    def equals_test(table, column):
        # `<table>.<column> = 'test'`
        return binary_condition(
            ConditionFunctions.EQ,
            Column(None, table, column),
            Literal(None, "test"),
        )

    def both(left, right):
        # Builds a fresh AND node on every call so that the expected values
        # below are compared by value, not by identity.
        return binary_condition(BooleanFunctions.AND, left, right)

    c1, c2, c3 = (
        equals_test(f"table{idx}", f"column{idx}") for idx in (1, 2, 3)
    )

    # (c1 AND c2) AND c3
    nested_and = binary_condition(BooleanFunctions.AND, both(c1, c2), c3)
    assert get_first_level_and_conditions(nested_and) == [c1, c2, c3]

    # equals(and(c1, c2), 1) AND c3
    wrapped_and = binary_condition(
        BooleanFunctions.AND,
        FunctionCall(
            None,
            "equals",
            (FunctionCall(None, "and", (c1, c2)), Literal(None, 1)),
        ),
        c3,
    )
    assert get_first_level_and_conditions(wrapped_and) == [c1, c2, c3]

    # (c1 AND c2) OR c3
    nested_or = binary_condition(BooleanFunctions.OR, both(c1, c2), c3)
    assert get_first_level_or_conditions(nested_or) == [both(c1, c2), c3]

    # (c1 OR (c2 AND c3)) = 1
    wrapped_or = binary_condition(
        ConditionFunctions.EQ,
        binary_condition(BooleanFunctions.OR, c1, both(c2, c3)),
        Literal(None, 1),
    )
    assert get_first_level_or_conditions(wrapped_or) == [c1, both(c2, c3)]
Example #5
0
def _replace_time_condition(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
    """
    Rewrite the top level AND conditions of ``query`` through
    ``_align_max_days_date_align`` using the ``max_days`` and
    ``date_align_seconds`` runtime configs.

    For a LogicalQuery the rewrite is applied once with the key of its FROM
    clause. For a composite query it is applied only when the FROM clause is
    a JoinClause: once per aliased node, each pass feeding on the result of
    the previous one. Other composite queries are left unchanged.
    """
    condition = query.get_condition()
    if condition is None:
        top_level = []
    else:
        top_level = get_first_level_and_conditions(condition)

    max_days, date_align = state.get_configs(
        [("max_days", None), ("date_align_seconds", 1)]
    )
    assert isinstance(date_align, int)
    max_days = int(max_days) if max_days is not None else None

    if isinstance(query, LogicalQuery):
        aligned = _align_max_days_date_align(
            query.get_from_clause().key, top_level, max_days, date_align
        )
        query.set_ast_condition(combine_and_conditions(aligned))
        return

    from_clause = query.get_from_clause()
    if not isinstance(from_clause, JoinClause):
        return

    for alias, node in from_clause.get_alias_node_map().items():
        assert isinstance(node.data_source, QueryEntity)  # mypy
        # Each pass starts from the conditions produced by the previous one.
        top_level = _align_max_days_date_align(
            node.data_source.key, top_level, max_days, date_align, alias
        )
        query.set_ast_condition(combine_and_conditions(top_level))
Example #6
0
 def inspect_expression(condition: Expression) -> None:
     """
     Run the still-missing checkers against each first level AND condition
     and drop every checker that accepts one of them.

     ``self`` and ``missing_checkers`` come from the enclosing scope;
     ``missing_checkers`` is mutated in place.
     """
     for top_level_cond in get_first_level_and_conditions(condition):
         for checker in self.__condition_checkers:
             if checker in missing_checkers and checker.check(top_level_cond):
                 missing_checkers.remove(checker)
def get_mapping_keys_in_condition(
    conditions: Expression,
    extractors: Sequence[Extractor[T]],
    is_skippable_condition: Callable[[Expression], bool],
) -> Optional[Set[T]]:
    """
    Walks the top level AND conditions and collects the keys produced by
    every extractor on each of them.

    Conditions accepted by ``is_skippable_condition`` are ignored. If any
    remaining condition is an OR we give up and return None, even though
    some optimizations might still be possible in that situation.
    """
    collected: Set[T] = set()

    for cond in get_first_level_and_conditions(conditions):
        if is_skippable_condition(cond):
            continue

        if is_any_binary_condition(cond, BooleanFunctions.OR):
            # An OR at this level makes the extracted keys unreliable.
            return None

        for extract in extractors:
            collected |= extract(cond)

    return collected
Example #8
0
def _replace_ast_condition(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    """
    Swaps the right hand operand of every top level AND condition of the
    WHERE clause that applies ``operator`` to the column ``field``.

    Conditions that do not match are kept as they are. Nothing happens when
    the query has no condition.
    """

    def swap_operand(expression: Expression) -> Expression:
        found = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        ).match(expression)
        if found is None:
            return expression
        # Keep the matched column, only the second parameter is replaced.
        return replace(
            expression, parameters=(found.expression("column"), new_operand)
        )

    condition = query.get_condition_from_ast()
    if condition is None:
        return

    rewritten = list(map(swap_operand, get_first_level_and_conditions(condition)))
    query.set_ast_condition(combine_and_conditions(rewritten))
Example #9
0
    def visit_function_call(self, exp: FunctionCall) -> str:
        """
        Format a function call as a string.

        ``array`` and multi-element ``tuple`` calls are written with literal
        bracket syntax, AND / OR calls are flattened and joined with the
        infix keyword (without applying the alias), and every other function
        is written as ``name(params)``.
        """
        name = exp.function_name

        if name == "array":
            # Workaround for https://github.com/ClickHouse/ClickHouse/issues/11622
            # Some distributed queries fail when arrays are passed as array(1,2,3)
            # and work when they are passed as [1, 2, 3]
            as_list = f"[{self.__visit_params(exp.parameters)}]"
            return self._alias(as_list, exp.alias)

        if name == "tuple" and len(exp.parameters) > 1:
            # Some distributed queries fail when tuples are passed as tuple(1,2,3)
            # and work when they are passed as (1, 2, 3)
            # to be safe, only do this for when a tuple has more than one element otherwise clickhouse
            # will interpret (1) -> 1 which will break things like 1 IN tuple(1)
            as_tuple = f"({self.__visit_params(exp.parameters)})"
            return self._alias(as_tuple, exp.alias)

        if name == BooleanFunctions.AND:
            return " AND ".join(
                c.accept(self) for c in get_first_level_and_conditions(exp))

        if name == BooleanFunctions.OR:
            joined = " OR ".join(
                c.accept(self) for c in get_first_level_or_conditions(exp))
            return f"({joined})"

        rendered = f"{escape_identifier(name)}({self.__visit_params(exp.parameters)})"
        return self._alias(rendered, exp.alias)
Example #10
0
 def validate(self, query: Query, alias: Optional[str] = None) -> None:
     """
     Reject queries whose top level AND conditions already contain one that
     is matched by ``self.match``.

     Raises InvalidExpressionException (with ``report=False``) on the first
     matching condition. ``alias`` is not used here.
     """
     condition = query.get_condition()
     if not condition:
         return

     for cond in get_first_level_and_conditions(condition):
         if not self.match.match(cond):
             continue
         raise InvalidExpressionException(
             cond,
             f"Cannot have existing conditions on time field {self.required_time_column}",
             report=False,
         )
Example #11
0
 def _get_prewhere_candidates(
     self, query: Query, prewhere_keys: Sequence[str]
 ) -> Sequence[Tuple[Iterable[Column], Expression]]:
     """
     Collect the conditions that may be moved to PREWHERE.

     A condition qualifies when it is a single top level AND condition that
     is a FunctionCall with an allowed operator and any of the columns it
     references is in ``prewhere_keys``. Each result pairs the columns of
     the condition with the condition itself. Returns an empty list when
     the query has no condition.
     """
     ast_condition = query.get_condition_from_ast()
     if ast_condition is None:
         return []

     candidates = []
     for cond in get_first_level_and_conditions(ast_condition):
         if not isinstance(cond, FunctionCall):
             continue
         if cond.function_name not in self.allowed_ast_operators:
             continue
         if any(col.column_name in prewhere_keys
                for col in get_columns_in_expression(cond)):
             candidates.append((get_columns_in_expression(cond), cond))
     return candidates
Example #12
0
    def validate(self, query: Query, alias: Optional[str] = None) -> None:
        """
        Check that each required column has a matching top level condition.

        For every column in ``self.required_columns`` a matcher is built with
        ``build_match`` (equality operator, ``int`` parameter type and the
        given ``alias``) and tried against the first level AND conditions of
        the query.

        Raises:
            InvalidQueryException: when at least one required column has no
                matching condition. The missing columns are listed in sorted
                order so the message is the same on every run.
        """
        condition = query.get_condition()
        top_level = get_first_level_and_conditions(
            condition) if condition else []

        missing = set()
        for col in self.required_columns or ():
            match = build_match(col, [ConditionFunctions.EQ], int, alias)
            if not any(match.match(cond) for cond in top_level):
                missing.add(col)

        if missing:
            # A set has no stable iteration order; sort it so that the error
            # message is deterministic.
            raise InvalidQueryException(
                f"missing required conditions for {', '.join(sorted(missing))}")
Example #13
0
    def _update_conditions(self, query: Query,
                           prewhere_conditions: Sequence[Expression]) -> None:
        """
        Split the query condition between WHERE and PREWHERE.

        Every top level AND condition listed in ``prewhere_conditions`` is
        removed from the WHERE clause, and ``prewhere_conditions`` becomes
        the PREWHERE clause. A side that ends up empty is set to None.
        """
        where_clause = query.get_condition_from_ast()
        # Callers are expected to have checked this already; the assert is
        # there to narrow the Optional for mypy.
        assert where_clause is not None

        remaining = []
        for cond in get_first_level_and_conditions(where_clause):
            if cond not in prewhere_conditions:
                remaining.append(cond)

        if remaining:
            query.set_ast_condition(combine_and_conditions(remaining))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions))
        else:
            query.set_prewhere_ast_condition(None)
Example #14
0
    def visit_function_call(self, exp: FunctionCall) -> str:
        """
        Format a function call as a string.

        ``array`` calls are written with literal bracket syntax, AND / OR
        calls are flattened and joined with the infix keyword (without
        applying the alias), and every other function is written as
        ``name(params)``.
        """
        name = exp.function_name

        if name == "array":
            # Workaround for https://github.com/ClickHouse/ClickHouse/issues/11622
            # Some distributed queries fail when arrays are passed as array(1,2,3)
            # and work when they are passed as [1, 2, 3]
            as_list = f"[{self.__visit_params(exp.parameters)}]"
            return self._alias(as_list, exp.alias)

        if name == BooleanFunctions.AND:
            return " AND ".join(
                c.accept(self) for c in get_first_level_and_conditions(exp)
            )

        if name == BooleanFunctions.OR:
            joined = " OR ".join(
                c.accept(self) for c in get_first_level_or_conditions(exp)
            )
            return f"({joined})"

        rendered = f"{escape_identifier(name)}({self.__visit_params(exp.parameters)})"
        return self._alias(rendered, exp.alias)
Example #15
0
 def __classify_combined_conditions(self, condition: Expression) -> ConditionClass:
     """
     Classify a condition, recursing through nested AND / OR combinations.

     Anything that is not a function expression is IRRELEVANT. An AND / OR
     is NOT_OPTIMIZABLE if any operand is, otherwise OPTIMIZABLE if any
     operand is, otherwise IRRELEVANT. Every other function is delegated to
     ``__classify_condition``.
     """
     if not isinstance(condition, FunctionExpr):
         return ConditionClass.IRRELEVANT

     if condition.function_name == BooleanFunctions.AND:
         operands = get_first_level_and_conditions(condition)
     elif condition.function_name == BooleanFunctions.OR:
         operands = get_first_level_or_conditions(condition)
     else:
         return self.__classify_condition(condition)

     outcomes = {self.__classify_combined_conditions(op) for op in operands}
     # Checked from the strongest verdict to the weakest.
     for verdict in (ConditionClass.NOT_OPTIMIZABLE, ConditionClass.OPTIMIZABLE):
         if verdict in outcomes:
             return verdict
     return ConditionClass.IRRELEVANT
def _get_mapping_keys_in_condition(
    condition: Expression, column_name: str
) -> Optional[Set[str]]:
    """
    Collects the keys the query filters the arrayJoin of ``column_name`` on,
    looking only at the top level conditions.

    The arrayFilter optimization is only applicable to arrayJoin conditions
    that are not in OR with other columns. To keep the problem simple only
    the conditions found in the first level of the query are considered:
    [['tagskey' '=' 'a'],['col' '=' 'b'],['col2' '=' 'c']]  works
    [[['tagskey' '=' 'a'], ['col2' '=' 'b']], ['tagskey' '=' 'c']] does not

    None is returned as soon as an OR condition is met, meaning that the
    optimization cannot be applied safely. An empty set means that no
    suitable arrayJoin condition was found, which, unlike the OR case, does
    not disqualify the whole query.
    """
    found_keys = set()

    for cond in get_first_level_and_conditions(condition):
        if is_binary_condition(cond, BooleanFunctions.OR):
            return None

        # arrayJoin(column) = 'key'
        equality = FunctionCall(
            None,
            String(ConditionFunctions.EQ),
            (array_join_pattern(column_name), Literal(None, Param("key", Any(str)))),
        ).match(cond)
        if equality is not None:
            found_keys.add(equality.string("key"))

        # arrayJoin(column) IN tuple(...): only string literals are kept
        membership = is_in_condition_pattern(array_join_pattern(column_name)).match(
            cond
        )
        if membership is not None:
            tuple_call = membership.expression("tuple")
            assert isinstance(tuple_call, FunctionCallExpr)
            for param in tuple_call.parameters:
                if isinstance(param, LiteralExpr) and isinstance(param.value, str):
                    found_keys.add(param.value)

    return found_keys
Example #17
0
def get_time_range(
        query: Query,
        timestamp_field: str) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Computes the tightest time range this query is restricted to.

    Among the first level AND conditions, it keeps the highest datetime
    literal found in a `>=` condition on ``timestamp_field`` and the lowest
    one found in a `<` condition, and returns them as a
    (lower bound, upper bound) tuple. A bound that is not found is None.
    Conditions nested in an OR are ignored because nothing can be said on
    how they compare to the other timestamp conditions.
    """

    condition_clause = query.get_condition_from_ast()
    if not condition_clause:
        return (None, None)

    lower = None
    upper = None
    for cond in get_first_level_and_conditions(condition_clause):
        found = FunctionCall(
            None,
            Param(
                "operator",
                Or([
                    String(OPERATOR_TO_FUNCTION[">="]),
                    String(OPERATOR_TO_FUNCTION["<"]),
                ]),
            ),
            (
                Column(None, None, String(timestamp_field)),
                Literal(None, Param("timestamp", Any(datetime))),
            ),
        ).match(cond)
        if found is None:
            continue

        timestamp = cast(datetime, found.scalar("timestamp"))
        if found.string("operator") == OPERATOR_TO_FUNCTION[">="]:
            if not lower or timestamp > lower:
                lower = timestamp
        elif not upper or timestamp < upper:
            # The pattern only admits `>=` and `<`, so this is the `<` case.
            upper = timestamp

    return (lower, upper)
Example #18
0
def get_time_range(
        query: ProcessableQuery[Table],
        timestamp_field: str) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Returns the (lower bound, upper bound) time range of this query.

    The first level AND conditions of the query are handed to
    ``get_time_range_expressions`` together with ``timestamp_field``; the
    first element of each of the two results it returns is used as the
    bound, or None when that result is empty. Only first level AND
    conditions are inspected since, if the timestamp is nested in an OR,
    nothing can be said on how it compares to the other timestamp
    conditions. (None, None) is returned when the query has no condition.
    """

    condition_clause = query.get_condition_from_ast()
    if not condition_clause:
        return (None, None)

    top_level = get_first_level_and_conditions(condition_clause)
    lower, upper = get_time_range_expressions(top_level, timestamp_field)
    return (lower[0] if lower else None), (upper[0] if upper else None)
Example #19
0
    def _replace_time_condition(
        self,
        query: Query,
        from_date: datetime,
        from_exp: FunctionCall,
        to_date: datetime,
        to_exp: FunctionCall,
    ) -> None:
        """
        Rewrite the two time conditions of the query with aligned dates.

        Both dates are aligned according to the ``date_align_seconds``
        config, then the range is capped to ``max_days`` days (when that
        config is set) by moving ``from_date`` forward. The top level AND
        conditions equal to ``from_exp`` / ``to_exp`` get their second
        parameter replaced with a literal holding the new date; all the
        other conditions are kept unchanged.
        """
        max_days, date_align = state.get_configs(
            [("max_days", None), ("date_align_seconds", 1)]
        )

        def align(dt: datetime) -> datetime:
            assert isinstance(date_align, int)
            remainder = (dt - dt.min).seconds % date_align
            return dt - timedelta(seconds=remainder)

        from_date = align(from_date)
        to_date = align(to_date)
        assert from_date <= to_date

        if max_days is not None and (to_date - from_date).days > max_days:
            from_date = to_date - timedelta(days=max_days)

        def with_new_date(exp: Expression) -> Expression:
            if isinstance(exp, FunctionCall):
                if exp == from_exp:
                    return replace(
                        exp,
                        parameters=(from_exp.parameters[0], Literal(None, from_date)),
                    )
                if exp == to_exp:
                    return replace(
                        exp,
                        parameters=(to_exp.parameters[0], Literal(None, to_date)),
                    )
            return exp

        condition = query.get_condition_from_ast()
        top_level = get_first_level_and_conditions(condition) if condition else []
        query.set_ast_condition(
            combine_and_conditions([with_new_date(cond) for cond in top_level])
        )
Example #20
0
def test_first_level_conditions() -> None:
    """
    Checks that nested AND / OR conditions are flattened into their first
    level operands.
    """

    def equals_test(table, column):
        # `<table>.<column> = 'test'`
        return binary_condition(
            None,
            ConditionFunctions.EQ,
            Column(None, table, column),
            Literal(None, "test"),
        )

    def both(left, right):
        # Builds a fresh AND node on every call so that the expected values
        # below are compared by value, not by identity.
        return binary_condition(None, BooleanFunctions.AND, left, right)

    c1, c2, c3 = (
        equals_test(f"table{idx}", f"column{idx}") for idx in (1, 2, 3)
    )

    # (c1 AND c2) AND c3
    nested_and = binary_condition(None, BooleanFunctions.AND, both(c1, c2), c3)
    assert get_first_level_and_conditions(nested_and) == [c1, c2, c3]

    # (c1 AND c2) OR c3
    nested_or = binary_condition(None, BooleanFunctions.OR, both(c1, c2), c3)
    assert get_first_level_or_conditions(nested_or) == [both(c1, c2), c3]
Example #21
0
    def _validate_groupby_fields_have_matching_conditions(
            query: Query, alias: Optional[str] = None) -> None:
        """
        Ensures that every field in the group by clause has a matching
        condition in the where clause. For example, with a groupby clause of
        [project_id, tags[3]] the where clause should contain
        `project_id = 3 AND tags[3] IN array(1,2,3)`. This is required to
        avoid returning an unspecified number of buckets.

        Raises InvalidQueryException when a group by expression is neither a
        column nor a subscriptable reference, or when it has no matching top
        level condition.
        """
        condition = query.get_condition()
        if condition:
            top_level = get_first_level_and_conditions(condition)
        else:
            top_level = []

        for exp in query.get_groupby():
            key: Optional[str]
            if isinstance(exp, SubscriptableReferenceExpr):
                column_name = str(exp.column.column_name)
                key = str(exp.key.value)
            elif isinstance(exp, Column):
                column_name = exp.column_name
                key = None
            else:
                raise InvalidQueryException(
                    "Unhandled column type in group by validation")

            match = build_match(
                col=column_name,
                ops=[ConditionFunctions.EQ],
                param_type=int,
                alias=alias,
                key=key,
            )
            if any(match.match(cond) for cond in top_level):
                continue

            raise InvalidQueryException(
                f"Every field in groupby must have a corresponding condition in "
                f"where clause. missing condition for field {exp}")
Example #22
0
def test_tags_expander() -> None:
    """
    Runs TagsExpanderProcessor on a parsed SnQL query and checks that the
    `tags_key` / `tags_value` references in the select, where and having
    clauses were turned into arrayJoin calls on `tags.key` / `tags.value`.
    """
    query_body = """
    MATCH (events)
    SELECT count(platform) AS platforms, testF(platform, tags_value) AS top_platforms, f1(tags_key, column2) AS f1_alias, f2() AS f2_alias
    WHERE tags_key = 'tags_key'
    AND project_id = 1
    AND timestamp >= toDateTime('2020-01-01 12:00:00')
    AND timestamp < toDateTime('2020-01-02 12:00:00')
    HAVING tags_value IN tuple('tag')
    """

    def array_join(alias, column):
        # Expected expansion of a tags_key / tags_value reference.
        return FunctionCall(alias, "arrayJoin", (Column(None, None, column), ))

    def platform():
        return Column("_snuba_platform", None, "platform")

    query, _ = parse_snql_query(query_body, get_dataset("events"))
    TagsExpanderProcessor().process_query(query, HTTPQuerySettings())

    expected_columns = [
        SelectedExpression(
            "platforms",
            FunctionCall("_snuba_platforms", "count", (platform(), )),
        ),
        SelectedExpression(
            "top_platforms",
            FunctionCall(
                "_snuba_top_platforms",
                "testF",
                (platform(), array_join("_snuba_tags_value", "tags.value")),
            ),
        ),
        SelectedExpression(
            "f1_alias",
            FunctionCall(
                "_snuba_f1_alias",
                "f1",
                (
                    array_join("_snuba_tags_key", "tags.key"),
                    Column("_snuba_column2", None, "column2"),
                ),
            ),
        ),
        SelectedExpression("f2_alias",
                           FunctionCall("_snuba_f2_alias", "f2", ())),
    ]
    assert query.get_selected_columns() == expected_columns

    condition = query.get_condition()
    assert condition is not None
    first_condition = get_first_level_and_conditions(condition)[0]
    assert first_condition == binary_condition(
        OPERATOR_TO_FUNCTION["="],
        array_join("_snuba_tags_key", "tags.key"),
        Literal(None, "tags_key"),
    )

    assert query.get_having() == in_condition(
        array_join("_snuba_tags_value", "tags.value"),
        [Literal(None, "tag")],
    )
Example #23
0
def match_query_to_entity(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> EntityKey:
    """
    Pick the entity a query should run against.

    The top level AND conditions are inspected first for conditions on the
    event type (matched by EVENT_CONDITION):
    - a `!=` condition on "transaction" selects EVENTS right away;
    - if the only event type required by `=` conditions is "transaction",
      TRANSACTIONS is selected;
    - if event types are required and none is "transaction", EVENTS is
      selected.

    Otherwise the decision is inferred from the columns, subscripts and
    functions referenced by the query: EVENTS when only event specific ones
    are found, TRANSACTIONS when only transaction specific ones are found,
    and EVENTS_AND_TRANSACTIONS when both or neither are found.

    A matched condition whose event type is neither a Column nor a Literal is
    ignored; it no longer raises UnboundLocalError or silently reuses the
    event type read from a previous condition.
    """
    # First check for a top level condition on the event type
    condition = query.get_condition_from_ast()
    event_types = set()
    if condition:
        for cond in get_first_level_and_conditions(condition):
            result = EVENT_CONDITION.match(cond)
            if not result:
                continue

            event_type_param = result.expression("event_type")
            if isinstance(event_type_param, Column):
                event_type = event_type_param.column_name
            elif isinstance(event_type_param, Literal):
                event_type = str(event_type_param.value)
            else:
                # No event type can be read from this expression. Skip it
                # instead of using an unbound or stale `event_type`.
                continue

            function = result.string("function")
            if function == ConditionFunctions.EQ:
                event_types.add(event_type)
            elif function == ConditionFunctions.NEQ and event_type == "transaction":
                return EVENTS

    if len(event_types) == 1 and "transaction" in event_types:
        return TRANSACTIONS

    if event_types and "transaction" not in event_types:
        return EVENTS

    # If we cannot clearly pick an entity from the top level conditions, then
    # inspect the columns requested to infer a selection.
    has_event_columns = False
    has_transaction_columns = False
    for col in query.get_all_ast_referenced_columns():
        if events_only_columns.get(col.column_name):
            has_event_columns = True
        elif transactions_only_columns.get(col.column_name):
            has_transaction_columns = True

    for subscript in query.get_all_ast_referenced_subscripts():
        # Subscriptable references will not be properly recognized above
        # through get_all_ast_referenced_columns since the columns that
        # method will find will look like `tags` or `measurements`, while
        # the column sets contains `tags.key` and `tags.value`.
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            has_event_columns = True
        if transactions_only_columns.get(schema_col_name):
            has_transaction_columns = True

    # Check for isHandled/notHandled
    if not has_event_columns:
        has_event_columns = any(
            EVENT_FUNCTIONS.match(expr) for expr in query.get_all_expressions()
        )

    # Check for apdex or failure rate
    if not has_transaction_columns:
        has_transaction_columns = any(
            TRANSACTION_FUNCTIONS.match(expr)
            for expr in query.get_all_expressions()
        )

    if has_event_columns and has_transaction_columns:
        # Impossible query, use the merge table
        return EVENTS_AND_TRANSACTIONS
    elif has_event_columns:
        return EVENTS
    elif has_transaction_columns:
        return TRANSACTIONS
    else:
        return EVENTS_AND_TRANSACTIONS
Example #24
0
def generate_subqueries(query: CompositeQuery[Entity]) -> None:
    """
    Turn every entity referenced by a join query into a subquery and
    move into those subqueries all the expressions they can evaluate
    on their own.

    An expression pushed into a subquery select clause is given a
    mangled alias, and the outer query refers to it through that alias.

    ```
    SELECT e.a, f(g.b) FROM Events e INNER JOIN Groups g ON ...
    ```

    is rewritten as

    ```
    SELECT e._snuba_a, g._snuba_b
    FROM (
        SELECT a as _snuba_a
        FROM events
    ) e INNER JOIN (
        SELECT f(b) as _snuba_b
        FROM groups
    ) g ON ....
    ```

    Conditions are a special case. A top level condition that fits
    entirely inside one subquery is added to the condition clause of
    that subquery and dropped from the outer query. A condition that
    spans several subqueries stays in the outer query, with its
    branches pushed down into the select clauses.

    The query is modified in place. When the from clause is itself a
    composite query the function recurses into it; when it is a simple
    processable query nothing is done.
    """

    source = query.get_from_clause()
    if isinstance(source, CompositeQuery):
        generate_subqueries(source)
        return
    if isinstance(source, ProcessableQuery):
        return

    # Neither a composite nor a simple query: this is a join clause.
    subqueries = source.accept(SubqueriesInitializer())
    alias_generator = _alias_generator()

    # NOTE: the clauses below are processed in a fixed order (select,
    # array join, condition, group by, having, order by, limit by) since
    # they all draw from the same alias generator.
    rewritten_select = [
        SelectedExpression(
            name=selected.name,
            expression=_process_root(selected.expression, subqueries,
                                     alias_generator),
        ) for selected in query.get_selected_columns()
    ]
    query.set_ast_selected_columns(rewritten_select)

    array_join = query.get_arrayjoin()
    if array_join is not None:
        rewritten_array_join = [
            _process_root(element, subqueries, alias_generator)
            for element in array_join
        ]
        query.set_arrayjoin(rewritten_array_join)

    where_clause = query.get_condition()
    if where_clause is not None:
        kept_in_main = []
        for top_level in get_first_level_and_conditions(where_clause):
            cut = top_level.accept(BranchCutter(alias_generator))
            if not isinstance(cut, SubqueryExpression):
                # The condition references more than one subquery, so it
                # has to stay in the outer query. Its branches are pushed
                # into the select clauses and referenced from here.
                kept_in_main.append(
                    _push_down_branches(cut, subqueries, alias_generator))
                continue

            # After cutting the subquery branches the whole condition
            # belongs to one subquery: hand it over and leave it out of
            # the outer query.
            target = subqueries[cut.subquery_alias]
            target.add_condition(cut.main_expression)

        query.set_ast_condition(
            combine_and_conditions(kept_in_main) if kept_in_main else None)

    # TODO: push down the group by when it is the same as the join key.
    rewritten_groupby = [
        _process_root(expression, subqueries, alias_generator)
        for expression in query.get_groupby()
    ]
    query.set_ast_groupby(rewritten_groupby)

    having = query.get_having()
    if having is not None:
        rewritten_having = [
            _process_root(component, subqueries, alias_generator)
            for component in get_first_level_and_conditions(having)
        ]
        query.set_ast_having(combine_and_conditions(rewritten_having))

    rewritten_orderby = [
        replace(
            clause,
            expression=_process_root(clause.expression, subqueries,
                                     alias_generator),
        ) for clause in query.get_orderby()
    ]
    query.set_ast_orderby(rewritten_orderby)

    limitby = query.get_limitby()
    if limitby is not None:
        limitby_columns = [
            _process_root(column, subqueries, alias_generator)
            for column in limitby.columns
        ]
        query.set_limitby(replace(limitby, columns=limitby_columns))

    # Finally swap the original join nodes with the generated subqueries.
    replacer = SubqueriesReplacer(subqueries)
    query.set_from_clause(replacer.visit_join_clause(source))
Example #25
0
def add_equivalent_conditions(query: CompositeQuery[Entity]) -> None:
    """
    Replicate conditions of a join query onto semantically equivalent
    columns of the other entities taking part in the join.

    Example: when events is joined with groupedmessage and the query
    has a condition on events.project_id, the same condition is added
    on groupedmessage.project_id, since the two columns are
    semantically equivalent.

    Adding every condition we can to every subquery reduces the amount
    of data clickhouse has to load for each of them.

    What is left alone:
    - top level conditions referencing columns of more than one table,
      since those cannot be pushed down to subqueries.
    - top level conditions referencing more than one column, since some
      of them may have no semantic equivalent. TODO: This can be
      extended to conditions with multiple columns that all have an
      equivalent in the same entity.

    The query is modified in place. A composite from clause is processed
    recursively, a simple processable query is left untouched.
    """

    from_clause = query.get_from_clause()
    if isinstance(from_clause, CompositeQuery):
        add_equivalent_conditions(from_clause)
        return
    if isinstance(from_clause, ProcessableQuery):
        return

    # From here on the from clause can only be a join.

    alias_to_entity = {
        table_alias: entity_from_node(node)
        for table_alias, node in from_clause.get_alias_node_map().items()
    }
    # Reverse index: every alias under which each entity is joined.
    aliases_by_entity: MutableMapping[EntityKey, Set[str]] = {}
    for table_alias, entity in alias_to_entity.items():
        aliases_by_entity.setdefault(entity, set()).add(table_alias)

    column_equivalence = get_equivalent_columns(from_clause)
    condition = query.get_condition()
    if condition is None:
        return

    top_level = get_first_level_and_conditions(condition)
    replicated = []
    for component in top_level:
        # Only top level conditions referencing exactly one column are
        # replicated. Conditions spanning multiple entities are thus
        # excluded, as they cannot be pushed down to subqueries.
        #
        # TODO: Address top level conditions that contain multiple
        # columns each of which has an equivalent in the same entity.
        classified = _classify_single_column_condition(
            component, alias_to_entity)
        if classified is None:
            continue
        column, source_alias = classified

        # The same entity may be joined several times under different
        # aliases: copy the condition onto each of the other aliases.
        for sibling_alias in aliases_by_entity[column.entity]:
            if sibling_alias == source_alias:
                continue
            to_sibling = partial(
                _replace_col,
                source_alias,
                column.column,
                sibling_alias,
                column.column,
            )
            replicated.append(component.transform(to_sibling))

        # Equivalent columns on other entities of the join: rewrite the
        # condition for every alias of each of those entities.
        for equivalent in column_equivalence.get(column, []):
            for target_alias in aliases_by_entity.get(
                    equivalent.entity, set()):
                to_equivalent = partial(
                    _replace_col,
                    source_alias,
                    column.column,
                    target_alias,
                    equivalent.column,
                )
                replicated.append(component.transform(to_equivalent))

    query.set_ast_condition(
        combine_and_conditions([*top_level, *replicated]))
Example #26
0
    def _get_condition_without_redundant_checks(
        self, condition: Expression, query: Query
    ) -> Expression:
        """Optimizes the case where the query condition contains the following:

        valueOf('my_tag') != '' AND valueOf('my_tag') == "something"
                          ^                            ^
                          |                            |
                      existence check               value check

        the existence check in this clause is redundant and prevents the hashmap
        optimization from being applied.

        This function will remove all tag existence checks
        from the condition IFF they are ANDed with a value check for the *same tag name*

        Args:
            condition: the expression to prune. Anything that is not an AND/OR
                function expression is returned as is.
            query: the query the condition belongs to. It is not inspected here,
                only forwarded to the recursive calls.

        Returns:
            The condition without the redundant existence checks.

        Side effects:
            This function works by flattening first level AND conditions to find clauses where
            existence checks and value checks are ANDed together. When the AND conditions are recombined,
            they are not guaranteed to be in the same structure (but are guaranteed to be functionally equivalent)

            Example:
                ┌───┐         ┌───┐
                │AND│         │AND│
                ├──┬┘         └┬──┤
                │  │           │  │
             ┌──┴┐ c           a ┌┴──┐
             │AND│    becomes    │AND│
             └┬─┬┘               ├──┬┘
              │ │                │  │
              a b                b  c
        """
        if not isinstance(condition, FunctionExpr):
            return condition

        if condition.function_name == BooleanFunctions.OR:
            # Each branch of an OR is pruned independently.
            pruned_conditions = [
                self._get_condition_without_redundant_checks(c, query)
                for c in get_first_level_or_conditions(condition)
            ]
            return combine_or_conditions(pruned_conditions)

        if condition.function_name != BooleanFunctions.AND:
            return condition

        sub_conditions = get_first_level_and_conditions(condition)
        # Tag names that have a value check among the ANDed conditions.
        tag_eq_match_strings = set()
        # Position of each existence check -> the match that identified it.
        matched_tag_exists_conditions = {}
        for condition_id, cond in enumerate(sub_conditions):
            tag_exist_match = None
            for tag_exists_pattern in self.__tag_exists_patterns:
                tag_exist_match = tag_exists_pattern.match(cond)
                if tag_exist_match:
                    matched_tag_exists_conditions[condition_id] = tag_exist_match
                    # Stop at the first pattern that recognizes the
                    # condition. Without this, a following pattern that
                    # does not match would reset tag_exist_match and the
                    # existence check would wrongly be examined as a
                    # value check below.
                    break
            if not tag_exist_match:
                eq_match = self.__optimizable_pattern.match(cond)
                if eq_match:
                    tag_eq_match_strings.add(eq_match.string(KEY_MAPPING_PARAM))

        useful_conditions = []
        for condition_id, cond in enumerate(sub_conditions):
            tag_exist_match = matched_tag_exists_conditions.get(condition_id)
            if (
                tag_exist_match
                and tag_exist_match.string("key") in tag_eq_match_strings
            ):
                # The existence check is made redundant by a value check
                # on the same tag: leave it out.
                continue
            useful_conditions.append(
                self._get_condition_without_redundant_checks(cond, query)
            )
        return combine_and_conditions(useful_conditions)
Example #27
0
def test_subscription_worker(subscription_data: SubscriptionData) -> None:
    """
    Feed one tick spanning three evaluation intervals to a
    SubscriptionWorker and check that one result per interval is
    published, with the expected time range conditions and payload.
    """
    broker: Broker[SubscriptionTaskResult] = Broker(MemoryMessageStorage(),
                                                    TestingClock())

    result_topic = Topic("subscription-results")
    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    identifier = SubscriptionIdentifier(PartitionId(0), uuid1())
    subscription = Subscription(identifier, subscription_data)

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    schedulers = {
        0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)
    }
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        schedulers,
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    # The tick covers `evaluations` intervals of length `frequency`
    # ending at `now`.
    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )
    tick_message = Message(Partition(Topic("events"), 0), 0, tick, now)

    result_futures = worker.process_message(tick_message)

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    expected_result = {
        "meta": [{
            "name": "count",
            "type": "UInt64"
        }],
        "data": [{
            "count": 0
        }],
    }

    for index in range(evaluations):
        timestamp = now - frequency * (evaluations - index)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[index]
        future_result = future.result()
        request, result = future_result
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        timestamp_column = Column(None, String("timestamp"))
        window_start = timestamp - subscription.data.time_window
        from_pattern = FunctionCall(
            String(ConditionFunctions.GTE),
            (timestamp_column, Literal(Datetime(window_start))),
        )
        to_pattern = FunctionCall(
            String(ConditionFunctions.LT),
            (Column(None, String("timestamp")), Literal(Datetime(timestamp))),
        )

        condition = request.query.get_condition()
        assert condition is not None

        conditions = get_first_level_and_conditions(condition)

        assert any(from_pattern.match(e) for e in conditions)
        assert any(to_pattern.match(e) for e in conditions)

        assert result == expected_result
Example #28
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Move the highest priority top level conditions of the query from
        the condition clause to the prewhere clause.

        A top level condition qualifies when it is a function call whose
        function is one of ALLOWED_OPERATORS and it references at least
        one of the prewhere candidate columns. Conditions are ranked by
        the position of their best column in the candidate list and at
        most `max_prewhere_conditions` of them are moved. The query is
        modified in place.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        candidate_keys = self.__prewhere_candidates

        # With FINAL we cannot blindly move any candidate condition to
        # the prewhere: FINAL is applied after PREWHERE, so the prewhere
        # could discard rows that FINAL is supposed to merge (for
        # example when the group_id is rewritten by an unmerge).
        # HACK: on a FINAL query, never move conditions on the columns
        # listed in omit_if_final. This works around a ClickHouse bug
        # with FINAL and PREWHERE on Low Cardinality and Nullable
        # columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            candidate_keys = [
                key for key in candidate_keys
                if key not in self.__omit_if_final
            ]

        if not candidate_keys:
            return

        where_clause = query.get_condition()
        if where_clause is None:
            return

        # Pair every eligible condition with the best (lowest) position
        # its columns have in the candidate list.
        ranked = []
        for cond in get_first_level_and_conditions(where_clause):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            positions = [
                candidate_keys.index(col.column_name)
                for col in get_columns_in_expression(cond)
                if col.column_name in candidate_keys
            ]
            if positions:
                ranked.append((min(positions), cond))

        if not ranked:
            return

        # Stable sort: conditions with the same priority keep the order
        # they have in the original condition.
        ranked.sort(key=lambda priority_and_cond: priority_and_cond[0])
        to_prewhere = [cond for _, cond in ranked][:limit]

        remaining = [
            cond for cond in get_first_level_and_conditions(where_clause)
            if cond not in to_prewhere
        ]

        query.set_ast_condition(
            combine_and_conditions(remaining) if remaining else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(to_prewhere) if to_prewhere else None)
Example #29
0
    def validate_required_conditions(
        self, query: Query, alias: Optional[str] = None
    ) -> bool:
        """
        Check that the top level AND conditions of the query include the
        required filter columns and the required time column.

        Each required filter column needs an equality (or IN) condition
        against integer literals. The required time column needs either
        an equality (or IN) condition against datetime literals, or both
        a lower and an upper bound; in the latter case the time
        conditions of the query are replaced through
        `_replace_time_condition`.

        When `alias` is given the conditions must reference the columns
        through that table alias, otherwise any alias is accepted.

        Returns True when all requirements are met, False otherwise.
        """
        if not (self._required_filter_columns or self._required_time_column):
            return True

        where_clause = query.get_condition_from_ast()
        conjuncts = (
            get_first_level_and_conditions(where_clause) if where_clause else []
        )
        if not conjuncts:
            return False

        if alias is None:
            alias_match = AnyOptionalString()
        else:
            alias_match = StringMatch(alias)

        def condition_matcher(
            col: str, ops: Sequence[str], param_type: Any
        ) -> Or[Expression]:
            column_match = ColumnMatch(alias_match, StringMatch(col))
            literal_match = LiteralMatch(AnyMatch(param_type))
            # col <op> literal
            scalar_form = FunctionCallMatch(
                Or([StringMatch(op) for op in ops]), (column_match, literal_match),
            )
            # col IN array(...) / tuple(...): handled on its own since
            # every parameter has to be checked individually.
            in_form = FunctionCallMatch(
                StringMatch(ConditionFunctions.IN),
                (
                    column_match,
                    FunctionCallMatch(
                        Or([StringMatch("array"), StringMatch("tuple")]),
                        all_parameters=LiteralMatch(AnyMatch(param_type)),
                    ),
                ),
            )
            return Or([scalar_form, in_form])

        for required_column in self._required_filter_columns or ():
            pattern = condition_matcher(
                required_column, [ConditionFunctions.EQ], int
            )
            if not any(pattern.match(cond) for cond in conjuncts):
                return False

        if not self._required_time_column:
            return True

        exact_time = condition_matcher(
            self._required_time_column, [ConditionFunctions.EQ], datetime,
        )
        if any(exact_time.match(cond) for cond in conjuncts):
            return True

        lower, upper = get_time_range_expressions(
            conjuncts, self._required_time_column, alias
        )
        if not lower or not upper:
            return False

        # The conditions are valid at this point, but they still have to be
        # aligned and capped to max_days: replace them in the query.
        self._replace_time_condition(query, *lower, *upper)
        return True
Example #30
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Move the highest priority top level conditions of the query from
        the condition clause to the prewhere clause.

        Candidate columns used inside `uniq` or `-If` aggregations are
        discarded first (a metric is emitted for each of them), then the
        `omit_if_final` columns when the query is FINAL. Among the
        remaining candidates, top level conditions that are function
        calls with a function in ALLOWED_OPERATORS are ranked by the
        position of their best column in the candidate list and at most
        `max_prewhere_conditions` of them are moved. The query is
        modified in place.
        """
        limit: int = (self.__max_prewhere_conditions
                      or settings.MAX_PREWHERE_CONDITIONS)
        candidate_keys = self.__prewhere_candidates

        # Candidates appearing in a uniq or -If aggregation are dropped:
        # a query like `countIf(col=x) .. PREWHERE col=x` can make the
        # Clickhouse server crash.
        uniq_cols: Set[str] = set()
        for exp in query.get_all_expressions():
            if not isinstance(exp, FunctionCall):
                continue
            if exp.function_name == "uniq" or exp.function_name.endswith("If"):
                for column in get_columns_in_expression(exp):
                    uniq_cols.add(column.column_name)

        for col in uniq_cols:
            if col not in candidate_keys:
                continue
            metrics.increment(
                "uniq_col_in_prewhere_candidate",
                tags={
                    "column": col,
                    "referrer": query_settings.referrer
                },
            )

        candidate_keys = [
            key for key in candidate_keys if key not in uniq_cols
        ]

        # With FINAL we cannot blindly move any candidate condition to
        # the prewhere: FINAL is applied after PREWHERE, so the prewhere
        # could discard rows that FINAL is supposed to merge (for
        # example when the group_id is rewritten by an unmerge).
        # HACK: on a FINAL query, never move conditions on the columns
        # listed in omit_if_final. This works around a ClickHouse bug
        # with FINAL and PREWHERE on Low Cardinality and Nullable
        # columns.
        # https://github.com/ClickHouse/ClickHouse/issues/16171
        if query.get_from_clause().final and self.__omit_if_final:
            candidate_keys = [
                key for key in candidate_keys
                if key not in self.__omit_if_final
            ]

        if not candidate_keys:
            return

        where_clause = query.get_condition()
        if where_clause is None:
            return

        # Pair every eligible condition with the best (lowest) position
        # its columns have in the candidate list.
        ranked = []
        for cond in get_first_level_and_conditions(where_clause):
            if not isinstance(cond, FunctionCall):
                continue
            if cond.function_name not in ALLOWED_OPERATORS:
                continue
            positions = [
                candidate_keys.index(col.column_name)
                for col in get_columns_in_expression(cond)
                if col.column_name in candidate_keys
            ]
            if positions:
                ranked.append((min(positions), cond))

        if not ranked:
            return

        # Stable sort: conditions with the same priority keep the order
        # they have in the original condition.
        ranked.sort(key=lambda priority_and_cond: priority_and_cond[0])
        to_prewhere = [cond for _, cond in ranked][:limit]

        remaining = [
            cond for cond in get_first_level_and_conditions(where_clause)
            if cond not in to_prewhere
        ]

        query.set_ast_condition(
            combine_and_conditions(remaining) if remaining else None)
        query.set_prewhere_ast_condition(
            combine_and_conditions(to_prewhere) if to_prewhere else None)