Example #1
def filter_key_values(key_values: Expression,
                      keys: Sequence[LiteralExpr]) -> Expression:
    """
    Filter an array of key-value pairs based on a sequence of keys
    (tag keys in this case).
    """
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (
            Lambda(
                None,
                ("pair", ),
                in_condition(
                    # A pair here is a tuple with two elements (key
                    # and value) and the index of the first element in
                    # Clickhouse is 1 instead of 0.
                    tupleElement(
                        None,
                        Argument(None, "pair"),
                        LiteralExpr(None, 1),
                    ),
                    keys,
                ),
            ),
            key_values,
        ),
    )
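
A quick orientation sketch, not part of the example above: the import paths, the LiteralExpr alias, and the column/key names are assumptions made for illustration.

from snuba.query.expressions import Column
from snuba.query.expressions import Literal as LiteralExpr

# Hypothetical call: keep only the "release" and "environment" pairs from a
# nested key/value array column named "tags".
filtered = filter_key_values(
    Column(None, None, "tags"),
    [LiteralExpr(None, "release"), LiteralExpr(None, "environment")],
)
# The expression built above corresponds roughly to the ClickHouse call:
#   arrayFilter(pair -> tupleElement(pair, 1) IN ('release', 'environment'), tags)
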
Example #2
def filter_column(column: Expression,
                  keys: Sequence[LiteralExpr]) -> Expression:
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (Lambda(None,
                ("x", ), in_condition(Argument(None, "x"), keys)), column),
    )
def filter_keys(column: Expression, keys: Sequence[LiteralExpr]) -> Expression:
    """
    Filter a Column array based on a sequence of keys.
    """
    return FunctionCallExpr(
        None,
        "arrayFilter",
        (
            Lambda(None, ("tag",), in_condition(None, Argument(None, "tag"), keys),),
            column,
        ),
    )
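
Similarly, a minimal usage sketch for filter_keys (and, analogously, filter_column) over a flat array column; the column name, values, and import paths are illustrative assumptions, not taken from the example above.

from snuba.query.expressions import Column
from snuba.query.expressions import Literal as LiteralExpr

# Hypothetical call: keep only the keys "release" and "environment" from an
# array column named "tags.key".
filtered = filter_keys(
    Column(None, None, "tags.key"),
    [LiteralExpr(None, "release"), LiteralExpr(None, "environment")],
)
# Roughly equivalent ClickHouse expression:
#   arrayFilter(tag -> tag IN ('release', 'environment'), tags.key)
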
Example #4
    def process_query(
        self,
        query: Query,
        extension_data: ExtensionData,
        request_settings: RequestSettings,
    ) -> None:
        project_ids = util.to_list(extension_data["project"])

        if project_ids:
            query.add_condition_to_ast(
                in_condition(
                    Column(None, None, self.__project_column),
                    [Literal(None, p) for p in project_ids],
                ))

        request_settings.add_rate_limit(
            self._get_rate_limit_params(project_ids))
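
For illustration, the condition added by process_query for a hypothetical project list [1, 2] is the AST equivalent of the SQL predicate project_id IN (1, 2). A minimal sketch, assuming the snuba helpers are importable as shown and using the concrete column name "project_id" in place of self.__project_column:

from snuba.query.conditions import in_condition
from snuba.query.expressions import Column, Literal

project_ids = [1, 2]  # hypothetical extension data
condition = in_condition(
    Column(None, None, "project_id"),
    [Literal(None, p) for p in project_ids],
)
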
Example #5

def _and(ex1: Expression, ex2: Expression) -> FunctionCall:
    return binary_condition(BooleanFunctions.AND, ex1, ex2)


# `errors > 0`
has_errors = binary_condition(ConditionFunctions.GT,
                              Column(None, None, "errors"), Literal(None, 0))
# `distinct_id != NIL`
did_not_nil = neq(distinct_id, lit_nil)
# `duration != MAX AND status == 1`
duration_condition = _and(neq(duration, Literal(None, MAX_UINT32)),
                          eq(status, Literal(None, 1)))
# `status IN (2,3,4)`
terminal_status = in_condition(status,
                               [Literal(None, s) for s in [2, 3, 4]])

# These are basically the same statements as the matview query
sessions_raw_translators = TranslationMappers(columns=[
    ColumnToCurriedFunction(
        None,
        "duration_quantiles",
        FunctionCall(None, "quantilesIf", quantiles),
        (duration, duration_condition),
    ),
    ColumnToFunction(None, "duration_avg", "avgIf", (duration,
                                                     duration_condition)),
    ColumnToFunction(
        None,
        "sessions",
        "sumIf",
Example #6
from snuba.query.processors.conditions_enforcer import (
    MandatoryConditionEnforcer,
    OrgIdEnforcer,
    ProjectIdEnforcer,
)
from snuba.request.request_settings import HTTPRequestSettings
from snuba.state import set_config

test_data = [
    pytest.param(
        Query(
            Table("errors", ColumnSet([])),
            selected_columns=[],
            condition=binary_condition(
                BooleanFunctions.AND,
                in_condition(Column(None, None, "project_id"), [Literal(None, 123)]),
                binary_condition(
                    "equals", Column(None, None, "org_id"), Literal(None, 1)
                ),
            ),
        ),
        True,
        id="Valid query. Both mandatory columns are there",
    ),
    pytest.param(
        Query(
            Table("errors", ColumnSet([])),
            selected_columns=[],
            condition=binary_condition(
                BooleanFunctions.AND,
                binary_condition(
Example #7
     build_query(),
     set(),
     id="no op filter",
 ),
 pytest.param(
     build_query(condition=binary_condition(
         ConditionFunctions.EQ,
         spans_op_col,
         Literal(None, "db"),
     ), ),
     {"db"},
     id="simple equality",
 ),
 pytest.param(
     build_query(condition=in_condition(
         spans_op_col,
         [Literal(None, "db"), Literal(None, "http")],
     ), ),
     {"db", "http"},
     id="op IN condition",
 ),
 pytest.param(
     build_query(
         condition=in_condition(
             spans_op_col,
             [Literal(None, "db")],
         ),
         having=in_condition(
             spans_op_col,
             [Literal(None, "http")],
         ),
     ),
Example #8
     binary_condition(
         BooleanFunctions.AND,
         binary_condition(
             BooleanFunctions.AND,
             binary_condition(
                 ConditionFunctions.GTE,
                 Column("_snuba_timestamp", None, "timestamp"),
                 Literal(None, datetime(2011, 7, 1, 19, 54, 15)),
             ),
             binary_condition(
                 ConditionFunctions.LT,
                 Column("_snuba_timestamp", None, "timestamp"),
                 Literal(None, datetime(2018, 7, 6, 19, 54, 15)),
             ),
         ),
         in_condition(Column("_snuba_project_id", None, "project_id"),
                      [Literal(None, 1)]),
     ),
     id="Legacy query",
 ),
 pytest.param(
     {
         "query": ("MATCH (events) "
                   "SELECT count() AS count BY time "
                   "WHERE "
                   "project_id IN tuple(1) AND "
                   "timestamp >= toDateTime('2011-07-01T19:54:15') AND "
                   "timestamp < toDateTime('2018-07-06T19:54:15') "
                   "LIMIT 1000 "
                   "GRANULARITY 60"),
     },
     Language.SNQL,
Example #9
     id="simple equality",
 ),
 pytest.param(
     build_query(
         selected_columns=[
             FunctionCall(
                 "tags_key",
                 "arrayJoin",
                 (Column(None, None, "tags.key"), ),
             ),
         ],
         condition=in_condition(
             FunctionCall(
                 "tags_key",
                 "arrayJoin",
                 (Column(None, None, "tags.key"), ),
             ),
             [Literal(None, "tag1"),
              Literal(None, "tag2")],
         ),
     ),
     {"tag1", "tag2"},
     id="tag IN condition",
 ),
 pytest.param(
     build_query(
         selected_columns=[
             FunctionCall(
                 "tags_key",
                 "arrayJoin",
                 (Column(None, None, "tags.key"), ),
Example #10
def test_tags_expander() -> None:
    query_body = {
        "selected_columns": [
            ["f1", ["tags_key", "column2"], "f1_alias"],
            ["f2", [], "f2_alias"],
        ],
        "aggregations": [
            ["count", "platform", "platforms"],
            ["testF", ["platform", "tags_value"], "top_platforms"],
        ],
        "conditions": [["tags_key", "=", "tags_key"]],
        "having": [["tags_value", "IN", ["tag"]]],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    processor = TagsExpanderProcessor()
    request_settings = HTTPRequestSettings()
    processor.process_query(query, request_settings)

    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "platforms",
            FunctionCall("platforms", "count",
                         (Column("platform", None, "platform"), )),
        ),
        SelectedExpression(
            "top_platforms",
            FunctionCall(
                "top_platforms",
                "testF",
                (
                    Column("platform", None, "platform"),
                    FunctionCall("tags_value", "arrayJoin",
                                 (Column(None, None, "tags.value"), )),
                ),
            ),
        ),
        SelectedExpression(
            "f1_alias",
            FunctionCall(
                "f1_alias",
                "f1",
                (
                    FunctionCall("tags_key", "arrayJoin",
                                 (Column(None, None, "tags.key"), )),
                    Column("column2", None, "column2"),
                ),
            ),
        ),
        SelectedExpression("f2_alias", FunctionCall("f2_alias", "f2",
                                                    tuple())),
    ]

    assert query.get_condition_from_ast() == binary_condition(
        None,
        OPERATOR_TO_FUNCTION["="],
        FunctionCall("tags_key", "arrayJoin",
                     (Column(None, None, "tags.key"), )),
        Literal(None, "tags_key"),
    )

    assert query.get_having_from_ast() == in_condition(
        None,
        FunctionCall("tags_value", "arrayJoin",
                     (Column(None, None, "tags.value"), )),
        [Literal(None, "tag")],
    )
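
In short, the processor rewrites bare tags_key / tags_value references into arrayJoin calls over the nested columns. A minimal sketch of the expanded form asserted above (the import path is an assumption):

from snuba.query.expressions import Column, FunctionCall

# The expanded expression; against ClickHouse it renders roughly as
#   arrayJoin(tags.key) AS tags_key
tags_key_expanded = FunctionCall(
    "tags_key", "arrayJoin", (Column(None, None, "tags.key"),)
)
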
Example #11
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
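
To make the date-range shrink concrete, here is a small self-contained sketch (the timestamps are hypothetical) that mirrors the min/max/plus-one-second logic above, using the standard library instead of util.parse_datetime:

from datetime import datetime, timedelta

# Hypothetical timestamps returned by the minimal query.
timestamps = ["2020-01-01T12:00:03", "2020-01-01T12:00:07"]

lower_bound = datetime.fromisoformat(min(timestamps))
upper_bound = datetime.fromisoformat(max(timestamps)) + timedelta(seconds=1)
# The second query is then constrained to:
#   timestamp >= 2020-01-01 12:00:03 AND timestamp < 2020-01-01 12:00:08
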
Example #12
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Example #13
def test_tags_expander() -> None:
    query_body = """
    MATCH (events)
    SELECT count(platform) AS platforms, testF(platform, tags_value) AS top_platforms, f1(tags_key, column2) AS f1_alias, f2() AS f2_alias
    WHERE tags_key = 'tags_key'
    AND project_id = 1
    AND timestamp >= toDateTime('2020-01-01 12:00:00')
    AND timestamp < toDateTime('2020-01-02 12:00:00')
    HAVING tags_value IN tuple('tag')
    """

    events = get_dataset("events")
    query, _ = parse_snql_query(query_body, events)

    processor = TagsExpanderProcessor()
    query_settings = HTTPQuerySettings()
    processor.process_query(query, query_settings)

    assert query.get_selected_columns() == [
        SelectedExpression(
            "platforms",
            FunctionCall(
                "_snuba_platforms",
                "count",
                (Column("_snuba_platform", None, "platform"), ),
            ),
        ),
        SelectedExpression(
            "top_platforms",
            FunctionCall(
                "_snuba_top_platforms",
                "testF",
                (
                    Column("_snuba_platform", None, "platform"),
                    FunctionCall(
                        "_snuba_tags_value",
                        "arrayJoin",
                        (Column(None, None, "tags.value"), ),
                    ),
                ),
            ),
        ),
        SelectedExpression(
            "f1_alias",
            FunctionCall(
                "_snuba_f1_alias",
                "f1",
                (
                    FunctionCall(
                        "_snuba_tags_key",
                        "arrayJoin",
                        (Column(None, None, "tags.key"), ),
                    ),
                    Column("_snuba_column2", None, "column2"),
                ),
            ),
        ),
        SelectedExpression("f2_alias",
                           FunctionCall("_snuba_f2_alias", "f2", tuple())),
    ]

    condition = query.get_condition()
    assert condition is not None
    conds = get_first_level_and_conditions(condition)
    assert conds[0] == binary_condition(
        OPERATOR_TO_FUNCTION["="],
        FunctionCall("_snuba_tags_key", "arrayJoin",
                     (Column(None, None, "tags.key"), )),
        Literal(None, "tags_key"),
    )

    assert query.get_having() == in_condition(
        FunctionCall("_snuba_tags_value", "arrayJoin",
                     (Column(None, None, "tags.value"), )),
        [Literal(None, "tag")],
    )