Example 1
        def process_column(exp: Expression) -> Expression:
            match = matcher.match(exp)

            if match:
                inner = replace(exp, alias=None)
                return FunctionCallExpr(
                    exp.alias,
                    "if",
                    (
                        binary_condition(
                            ConditionFunctions.IN,
                            inner,
                            literals_tuple(
                                None,
                                [
                                    LiteralExpr(None, "1"),
                                    LiteralExpr(None, "True")
                                ],
                            ),
                        ),
                        LiteralExpr(None, "True"),
                        LiteralExpr(None, "False"),
                    ),
                )

            return exp
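
A hedged sketch of what the rewrite above produces when the matcher fires; the column and alias names below are invented for illustration:

# Matched input:  flag_col AS my_alias
# Output expression, rendered as approximate Clickhouse SQL:
#   if(in(flag_col, tuple('1', 'True')), 'True', 'False') AS my_alias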
Example 2
def filter_expression(
    columns: Expression,
    single_filtered: Dict[LiteralExpr, Sequence[str]],
    multiple_filtered: Dict[Tuple[LiteralExpr, ...], Sequence[Tuple[str,
                                                                    ...]]],
) -> Expression:
    argument_name = "arg"
    argument = Argument(None, argument_name)

    conditions: List[Expression] = []

    for index in single_filtered:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                tupleElement(None, argument, index),
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        LiteralExpr(None, f) for f in single_filtered[index]),
                ),
            ))

    for indices in multiple_filtered:
        conditions.append(
            binary_condition(
                ConditionFunctions.IN,
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        tupleElement(None, argument, index)
                        for index in indices),
                ),
                FunctionCallExpr(
                    None,
                    "tuple",
                    tuple(
                        FunctionCallExpr(
                            None,
                            "tuple",
                            tuple(LiteralExpr(None, t) for t in tuples),
                        ) for tuples in multiple_filtered[indices]),
                ),
            ))

    return FunctionCallExpr(
        None,
        "arrayFilter",
        (Lambda(None, (argument_name, ),
                combine_and_conditions(conditions)), columns),
    )
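
A hedged sketch of the expression this helper returns, using made-up index and value inputs:

# filter_expression(cols,
#                   {LiteralExpr(None, 1): ["db"]},
#                   {(LiteralExpr(None, 1), LiteralExpr(None, 2)): [("db", "g1")]})
# corresponds roughly to the Clickhouse expression:
#   arrayFilter(
#       arg -> in(tupleElement(arg, 1), tuple('db'))
#              AND in(tuple(tupleElement(arg, 1), tupleElement(arg, 2)),
#                     tuple(tuple('db', 'g1'))),
#       cols)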
Example 3
 def _produce_output(self, expression: ColumnExpr) -> FunctionCallExpr:
     if not self.nullable:
         return build_mapping_expr(
             expression.alias,
             self.to_nested_col_table_name,
             self.to_nested_col_name,
             LiteralExpr(None, self.to_nested_mapping_key),
         )
     else:
         return build_nullable_mapping_expr(
             expression.alias,
             self.to_nested_col_table_name,
             self.to_nested_col_name,
             LiteralExpr(None, self.to_nested_mapping_key),
         )
Example 4
def filter_key_values(key_values: Expression,
                      keys: Sequence[LiteralExpr]) -> Expression:
    """
    Filter an array of key value pairs based on a sequence of keys
    (tag keys in this case).
    """
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (
            Lambda(
                None,
                ("pair", ),
                in_condition(
                    # A pair here is a tuple with two elements (key
                    # and value) and the index of the first element in
                    # Clickhouse is 1 instead of 0.
                    tupleElement(
                        None,
                        Argument(None, "pair"),
                        LiteralExpr(None, 1),
                    ),
                    keys,
                ),
            ),
            key_values,
        ),
    )
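
A small hedged usage sketch; the pairs column and tag keys are hypothetical:

# filter_key_values(
#     ColumnExpr(None, None, "tags_kv_pairs"),
#     [LiteralExpr(None, "environment"), LiteralExpr(None, "release")],
# )
# builds an expression roughly equivalent to:
#   arrayFilter(pair -> in(tupleElement(pair, 1), tuple('environment', 'release')),
#               tags_kv_pairs)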
Example 5
 def __replace_with_hash(self, condition: Expression) -> Expression:
     match = self.__optimizable_pattern.match(condition)
     if (
         match is None
         or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key"
     ):
         return condition
     rhs = match.expression("right_hand_side")
     assert isinstance(rhs, LiteralExpr)
     key = match.string(KEY_MAPPING_PARAM).translate(ESCAPE_TRANSLATION)
     return FunctionExpr(
         alias=condition.alias,
         function_name="has",
         parameters=(
             Column(
                 alias=None,
                 table_name=match.optional_string(TABLE_MAPPING_PARAM),
                 column_name=self.__hash_map_name,
             ),
             FunctionExpr(
                 alias=None,
                 function_name="cityHash64",
                 parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
             ),
         ),
     )
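
A hedged sketch of the rewrite above; the exact shape of __optimizable_pattern is not shown here, and the column and key names are invented:

# Assuming __column_name == "tags" and __hash_map_name == "_tags_hash_map",
# a matched equality condition on tags.key / tags.value for key 'environment'
# and literal 'prod' is replaced with roughly:
#   has(_tags_hash_map, cityHash64('environment=prod'))
# keeping the original alias and table name, with the key escaped through
# ESCAPE_TRANSLATION before being concatenated with the value.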
Example 6
def filtered_mapping_keys(alias: Optional[str], column_name: str,
                          filtered: Sequence[str]) -> Expression:
    return arrayJoin(
        alias,
        filter_column(
            ColumnExpr(None, None, column_name),
            [LiteralExpr(None, f) for f in filtered],
        ),
    )
Example 7
        def replace_expression(expr: Expression) -> Expression:
            """
            Applies the appropriate optimization on a single arrayJoin expression.
            """
            match = arrayjoin_pattern.match(expr)
            if match is None:
                return expr

            if arrayjoins_in_query == {
                key_column(self.__column_name),
                val_column(self.__column_name),
            }:
                # Both arrayJoin(col.key) and arrayJoin(col.value) expressions
                # present in the query. Do the arrayJoin on key-value pairs
                # instead of independent arrayJoins for keys and values.
                array_index = (
                    LiteralExpr(None, 1)
                    if match.string("col") == key_column(self.__column_name)
                    else LiteralExpr(None, 2)
                )

                if not filtered_keys:
                    return _unfiltered_mapping_pairs(
                        expr.alias, self.__column_name, pair_alias, array_index
                    )
                else:
                    return _filtered_mapping_pairs(
                        expr.alias,
                        self.__column_name,
                        pair_alias,
                        filtered_keys,
                        array_index,
                    )

            elif filtered_keys:
                # Only one of arrayJoin(col.key) and arrayJoin(col.value)
                # is present, and it is arrayJoin(col.key) since we found
                # filtered keys.
                return _filtered_mapping_keys(
                    expr.alias, self.__column_name, filtered_keys
                )
            else:
                # No viable optimization
                return expr
Example 8
        def replace_exp(exp: Expression) -> Expression:
            matcher = FunctionCall(
                String("notEquals"),
                (Column(None, String("type")), Literal(String("transaction"))),
            )

            if matcher.match(exp):
                return LiteralExpr(None, 1)

            return exp
Example 9
    def __find_tuple_index(self, column_name: str) -> LiteralExpr:
        """
        Translates a column name to a tuple index. Used for accessing the specific
        column from within the single optimized arrayJoin.

        This should only be used when ALL possible columns are present in the select
        clause of the query because it assumes a specific ordering. If any of the
        possible columns are missing from the select clause, then the ordering will
        not be as expected.
        """
        for i, col in enumerate(self.all_columns):
            if column_name == col:
                return LiteralExpr(None, i + 1)
        raise ValueError(f"Unknown column: {column_name}")
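
A brief worked example, assuming a hypothetical self.all_columns:

# With all_columns == ("spans.op", "spans.group"),
# __find_tuple_index("spans.group") returns LiteralExpr(None, 2):
# indices are 1-based because Clickhouse's tupleElement counts from 1
# (see the comment in Example 4).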
Example 10
def build_nullable_mapping_expr(
    alias: Optional[str],
    table_name: Optional[str],
    col_name: str,
    mapping_key: Expression,
) -> FunctionCallExpr:
    # TODO: Add a pattern for this expression if we need it.
    return FunctionCallExpr(
        alias,
        "if",
        (
            FunctionCallExpr(
                None,
                "has",
                (ColumnExpr(None, table_name, f"{col_name}.key"), mapping_key),
            ),
            build_mapping_expr(None, table_name, col_name, mapping_key),
            LiteralExpr(None, None),
        ),
    )
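
A hedged rendering of the expression built above; build_mapping_expr's body is not shown here, so it is left symbolic:

# build_nullable_mapping_expr(alias, None, "tags", LiteralExpr(None, "environment"))
# corresponds roughly to:
#   if(has(tags.key, 'environment'),
#      <build_mapping_expr(None, None, "tags", 'environment')>,
#      NULL) AS alias
# i.e. the value lookup is guarded by has() and falls back to NULL when the key
# is absent.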
Example 11
def generate_bloom_filter_condition(
    column_name: str,
    single_filtered: Dict[str, Sequence[str]],
    multiple_filtered: Dict[Tuple[str, ...], Sequence[Tuple[str, ...]]],
) -> Optional[Expression]:
    """
    Generate the filters on the array columns to use the bloom filter index on
    the spans.op and spans.group columns in order to filter the transactions
    prior to the array join.

    The bloom filter index requires the use of the has function, therefore
    the final condition is built up from a series of has conditions.
    """

    per_key_vals: Dict[str, Set[str]] = defaultdict(set)

    for key, single_filter in single_filtered.items():
        for val in single_filter:
            per_key_vals[key].add(val)

    for keys, multiple_filter in multiple_filtered.items():
        for val_tuple in multiple_filter:
            for key, val in zip(keys, val_tuple):
                per_key_vals[key].add(val)

    conditions = [
        combine_or_conditions([
            FunctionCallExpr(
                None,
                "has",
                (ColumnExpr(None, None, key), LiteralExpr(None, val)),
            ) for val in sorted(vals)
        ]) for key, vals in per_key_vals.items()
    ]

    return combine_and_conditions(conditions) if conditions else None
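
A worked sketch with hypothetical inputs to show the shape of the generated condition:

# generate_bloom_filter_condition(
#     "spans",
#     {"spans.op": ["db", "http"]},
#     {("spans.op", "spans.group"): [("db", "g1")]},
# )
# collects per-key value sets {"spans.op": {"db", "http"}, "spans.group": {"g1"}}
# and returns a condition roughly equivalent to:
#   (has(spans.op, 'db') OR has(spans.op, 'http')) AND has(spans.group, 'g1')
# assuming combine_or_conditions returns a single condition unchanged.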
Example 12
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby()
        if not orderby or orderby[0] != f"-{self.__timestamp_col}":
            return None

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, ">=")),
            None,
        )

        to_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, "<")),
            None,
        )
        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if not from_date_str or not to_date_str:
            return None

        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )
        to_date = util.parse_datetime(to_date_str, date_align)
        from_date = util.parse_datetime(from_date_str, date_align)

        if from_date != from_date_ast:
            logger.warning(
                "Mismatch in start date on time splitter.",
                extra={
                    "ast": str(from_date_ast),
                    "legacy": str(from_date)
                },
                exc_info=True,
            )
            metrics.increment("mismatch.ast_from_date")

        remaining_offset = query.get_offset()

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_condition(split_query, self.__timestamp_col, ">=",
                               split_start.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_condition(split_query, self.__timestamp_col, "<",
                               split_end.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
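
A brief worked example of the adaptive step sizing in the loop above; the numbers are illustrative:

# With limit = 500 and split_step = 3600 (one hour), suppose the first window
# returns 100 rows. Then remaining = 500 - 100 = 400 and the next window covers
# 3600 * ceil(400 / 100) = 14400 seconds (four hours). A window that returns no
# rows instead grows the step by the static factor STEP_GROWTH.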
Example 13
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        arrayjoin_pattern = FunctionCall(
            String("arrayJoin"),
            (Column(column_name=Param(
                "col",
                Or([
                    String(key_column(self.__column_name)),
                    String(val_column(self.__column_name)),
                ]),
            ), ), ),
        )

        arrayjoins_in_query = set()
        for e in query.get_all_expressions():
            match = arrayjoin_pattern.match(e)
            if match is not None:
                arrayjoins_in_query.add(match.string("col"))

        filtered_keys = [
            LiteralExpr(None, key)
            for key in get_filtered_mapping_keys(query, self.__column_name)
        ]

        # Ensures the alias we apply to the arrayJoin is not already taken.
        used_aliases = {exp.alias for exp in query.get_all_expressions()}
        pair_alias_root = f"snuba_all_{self.__column_name}"
        pair_alias = pair_alias_root
        index = 0
        while pair_alias in used_aliases:
            index += 1
            pair_alias = f"{pair_alias_root}_{index}"

        def replace_expression(expr: Expression) -> Expression:
            """
            Applies the appropriate optimization on a single arrayJoin expression.
            """
            match = arrayjoin_pattern.match(expr)
            if match is None:
                return expr

            if arrayjoins_in_query == {
                    key_column(self.__column_name),
                    val_column(self.__column_name),
            }:
                # Both arrayJoin(col.key) and arrayJoin(col.value) expressions
                # present in the query. Do the arrayJoin on key-value pairs
                # instead of independent arrayJoins for keys and values.
                array_index = (LiteralExpr(
                    None, 1) if match.string("col") == key_column(
                        self.__column_name) else LiteralExpr(None, 2))

                if not filtered_keys:
                    return _unfiltered_mapping_pairs(expr.alias,
                                                     self.__column_name,
                                                     pair_alias, array_index)
                else:
                    return _filtered_mapping_pairs(
                        expr.alias,
                        self.__column_name,
                        pair_alias,
                        filtered_keys,
                        array_index,
                    )

            elif filtered_keys:
                # Only one of arrayJoin(col.key) and arrayJoin(col.value)
                # is present, and it is arrayJoin(col.key) since we found
                # filtered keys.
                return _filtered_mapping_keys(expr.alias, self.__column_name,
                                              filtered_keys)
            else:
                # No viable optimization
                return expr

        query.transform_expressions(replace_expression)
Example 14
         "irrelevant",
         "irrelevant",
         (
             ColumnExpr("relevant", "relevant", "relevant"),
             ColumnExpr("relevant", "relevant", "relevant"),
         ),
     ),
     None,
 ),
 (
     "Does not match any Column",
     FunctionCall(None, (Param("p1", Any(ColumnExpr)), )),
     FunctionCallExpr(
         "irrelevant",
         "irrelevant",
         (LiteralExpr(None, "str"), ),
     ),
     None,
 ),
 (
     "Union of two patterns - match",
     Or([
         Param("option1", Column(None, String("col_name"))),
         Param("option2", Column(None, String("other_col_name"))),
     ]),
     ColumnExpr(None, None, "other_col_name"),
     MatchResult({"option2": ColumnExpr(None, None, "other_col_name")}),
 ),
 (
     "Union of two patterns - no match",
     Or([
Example 15
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query
            # size, so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Example 16
 def _produce_output(self, expression: ColumnExpr) -> LiteralExpr:
     return LiteralExpr(alias=expression.alias, value=self.to_literal_value)
Example 17
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby_from_ast():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby_from_ast()
        if (not orderby or orderby[0].direction != OrderByDirection.DESC
                or not isinstance(orderby[0].expression, ColumnExpr) or
                not orderby[0].expression.column_name == self.__timestamp_col):
            return None

        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if from_date_ast is None or to_date_ast is None:
            return None

        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )
        assert isinstance(split_step, int)
        remaining_offset = query.get_offset()

        overall_result: Optional[QueryResult] = None
        split_end = to_date_ast
        split_start = max(split_end - timedelta(seconds=split_step),
                          from_date_ast)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step),
                        from_date_ast)
                except OverflowError:
                    split_start = from_date_ast

        return overall_result
Example 18
    FunctionCall,
    Literal,
    MatchResult,
    OptionalString,
    Or,
    Param,
    Pattern,
    String,
    SubscriptableReference,
)

test_cases = [
    (
        "Literal match",
        Literal(None),
        LiteralExpr("random_alias", 1),
        MatchResult(),
    ),
    (
        "Literal match with none type",
        Literal(Any(type(None))),
        LiteralExpr("alias", 1),
        None,
    ),
    (
        "Single node match",
        Column(OptionalString("table"), String("test_col")),
        ColumnExpr("alias_we_don't_care_of", "table", "test_col"),
        MatchResult(),
    ),
    (
Example 19
    Any,
    AnyExpression,
    AnyOptionalString,
    Column,
    FunctionCall,
    Literal,
    MatchResult,
    OptionalString,
    Or,
    Param,
    Pattern,
    String,
)

test_cases = [
    ("Literal match", Literal(None), LiteralExpr("random_alias", 1), MatchResult(),),
    (
        "Literal match with none type",
        Literal(Any(type(None))),
        LiteralExpr("alias", 1),
        None,
    ),
    (
        "Single node match",
        Column(OptionalString("table"), String("test_col")),
        ColumnExpr("alias_we_don't_care_of", "table", "test_col"),
        MatchResult(),
    ),
    (
        "Single node no match",
        Column(None, String("test_col")),
Example 20
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query
            # size, so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
Example 21
    Literal as LiteralExpr,
)
from snuba.query.validation import InvalidFunctionCall
from snuba.query.validation.signature import (
    Any,
    Column,
    Literal,
    ParamType,
    SignatureValidator,
)

test_cases = [
    pytest.param(
        (
            ColumnExpr(alias=None, table_name=None, column_name="event_id"),
            LiteralExpr(None, "param"),
        ),
        [Any(), Any()],
        False,
        False,
        id="Valid Expression",
    ),
    pytest.param(
        (
            ColumnExpr(alias=None, table_name=None, column_name="event_id"),
            LiteralExpr(None, "param"),
        ),
        [Column({String}), Any()],
        False,
        False,
        id="Valid Specific Expression",