def test_format_expressions(name: str, query: ClickhouseQuery,
                            expected_query: ClickhouseQuery) -> None:
    MappingColumnPromoter({
        "tags": {
            "promoted_tag": "promoted"
        }
    }).process_query(query, HTTPRequestSettings())

    assert (query.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
Example #2
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization
    on.
    Which means: if the arrayJoin is in the select clause, there
    are one or more top level AND condition on the arrayJoin and
    there is no OR condition in the query.
    """
    array_join_found = any(
        array_join_pattern(column_name).match(f) is not None
        for selected in query.get_selected_columns_from_ast() or []
        for f in selected.expression)

    if not array_join_found:
        return set()

    ast_condition = query.get_condition_from_ast()
    cond_keys = (_get_mapping_keys_in_condition(ast_condition, column_name)
                 if ast_condition is not None else set())
    if cond_keys is None:
        # This means we found an OR. We conservatively give up, even though
        # there could be cases where this condition is still optimizable.
        return set()

    ast_having = query.get_having_from_ast()
    having_keys = (_get_mapping_keys_in_condition(ast_having, column_name)
                   if ast_having is not None else set())
    if having_keys is None:
        # Same as above
        return set()

    return cond_keys | having_keys
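A minimal sketch of the contract the code above relies on for _get_mapping_keys_in_condition: walk the top level AND conditions, give up with None as soon as an OR appears anywhere in a branch, and otherwise collect the mapping keys matched by the arrayJoin pattern. The helper get_first_level_and_conditions, the BooleanFunctions constant and the "key" parameter name on the match result are assumptions about this snuba version, not guaranteed API.

from typing import Optional, Set

from snuba.query.conditions import BooleanFunctions, get_first_level_and_conditions
from snuba.query.expressions import Expression, FunctionCall


def _get_mapping_keys_in_condition_sketch(
    condition: Expression, column_name: str
) -> Optional[Set[str]]:
    keys: Set[str] = set()
    for cond in get_first_level_and_conditions(condition):
        # Expressions iterate over themselves and their children, so this
        # detects an OR nested anywhere in the branch, which makes the
        # arrayFilter rewrite unsafe.
        if any(
            isinstance(exp, FunctionCall) and exp.function_name == BooleanFunctions.OR
            for exp in cond
        ):
            return None
        # array_join_pattern is the same helper used in the function above.
        match = array_join_pattern(column_name).match(cond)
        if match is not None:
            keys.add(match.string("key"))  # "key" is an assumed parameter name
    return keys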
Example #3
def query_runner(query: Query, settings: RequestSettings,
                 reader: Reader[SqlQuery]) -> QueryResult:
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "duration_quantiles",
            CurriedFunctionCall(
                "duration_quantiles",
                FunctionCall(
                    None,
                    "quantilesIfMerge",
                    (Literal(None, 0.5), Literal(None, 0.9)),
                ),
                (Column(None, None, "duration_quantiles"), ),
            ),
        ),
        SelectedExpression(
            "sessions",
            FunctionCall("sessions", "countIfMerge",
                         (Column(None, None, "sessions"), )),
        ),
        SelectedExpression(
            "users",
            FunctionCall("users", "uniqIfMerge",
                         (Column(None, None, "users"), )),
        ),
    ]
    return QueryResult({}, {})
Example #4
    def __init__(self, query: Query, settings: RequestSettings,) -> None:
        # Clickhouse query structure.
        # We reference the clauses here directly since that makes it easier
        # to process this query independently from the Clickhouse Query
        # object, and there is no risk in doing so since they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_data_source()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        if self.__having:
            assert self.__groupby, "found HAVING clause with no GROUP BY"

        self.__turbo = settings.get_turbo()
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        self.__prewhere = query.get_prewhere_ast()

        self.__settings = settings
        self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
        self.__formatted_query: Optional[str] = None
        self.__sql_data: Optional[Mapping[str, str]] = None
Example #5
def query_runner(
    query: Query, settings: RequestSettings, reader: Reader[SqlQuery]
) -> QueryResult:
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "tags[transaction]", Column("tags[transaction]", None, "transaction")
        ),
        SelectedExpression(
            "contexts[browser.name]",
            FunctionCall(
                "contexts[browser.name]",
                "arrayElement",
                (
                    Column(None, None, "contexts.value"),
                    FunctionCall(
                        None,
                        "indexOf",
                        (
                            Column(None, None, "contexts.key"),
                            Literal(None, "browser.name"),
                        ),
                    ),
                ),
            ),
        ),
    ]
    return QueryResult({}, {})
Example #6
    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:

        if events_storage.get_storage_key() == StorageKey.EVENTS:
            transaction_col_name = "transaction"
        else:
            transaction_col_name = "transaction_name"

        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "tags[transaction]",
                Column("_snuba_tags[transaction]", None, transaction_col_name),
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "_snuba_contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})
Example #7
def test_event_id_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(
                "the_event_id", Column("the_event_id", None, "event_id")
            ),
        ],
    )
    expected = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(
                "the_event_id",
                FunctionCall(
                    "the_event_id",
                    "replaceAll",
                    (
                        FunctionCall(
                            None, "toString", (Column(None, None, "event_id"),),
                        ),
                        Literal(None, "-"),
                        Literal(None, ""),
                    ),
                ),
            ),
        ],
    )

    EventIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    formatted = unprocessed.get_selected_columns_from_ast()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert formatted == "(replaceAll(toString(event_id), '-', '') AS the_event_id)"
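The rewrite asserted above can be expressed with the transform_expressions hook shown in Example #16. The sketch below is only an illustration of that substitution under that assumption, not EventIdColumnProcessor's actual implementation.

def format_event_id_sketch(query: Query) -> None:
    def replace(exp: Expression) -> Expression:
        # Rewrite references to the raw UUID column into the string form the
        # test above expects: replaceAll(toString(event_id), '-', '').
        if isinstance(exp, Column) and exp.column_name == "event_id":
            return FunctionCall(
                exp.alias,
                "replaceAll",
                (
                    FunctionCall(
                        None, "toString", (Column(None, exp.table_name, "event_id"),)
                    ),
                    Literal(None, "-"),
                    Literal(None, ""),
                ),
            )
        return exp

    query.transform_expressions(replace)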
Example #8
def test_events_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id", Column("the_group_id", None, "group_id")
            ),
            SelectedExpression("the_message", Column("the_message", None, "message")),
        ],
    )
    expected = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id",
                FunctionCall(
                    "the_group_id",
                    "nullIf",
                    (Column(None, None, "group_id"), Literal(None, 0),),
                ),
            ),
            SelectedExpression("the_message", Column("the_message", None, "message"),),
        ],
    )

    GroupIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    expected = (
        "(nullIf(group_id, 0) AS the_group_id)",
        "(message AS the_message)",
    )

    for idx, column in enumerate(unprocessed.get_selected_columns_from_ast()[1:]):
        formatted = column.expression.accept(ClickhouseExpressionFormatter())
        assert expected[idx] == formatted
Example #9
def test_tags_processor(query_body: MutableMapping[str, Any],
                        expected_query: ClickhouseQuery) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    assert (processed.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (processed.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (processed.get_having_from_ast() ==
            expected_query.get_having_from_ast())
Example #10
def _run_and_apply_column_names(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Executes the query and, after that, replaces the column names in
    QueryResult with the names the user expects and that are stored in
    the SelectedExpression objects in the Query.
    This will eventually allow us to remove aliases from the Query AST,
    since those aliases are currently only needed to produce the names the
    user expects in the output.
    """

    result = _format_storage_query_and_run(
        timer,
        query_metadata,
        from_date,
        to_date,
        referrer,
        clickhouse_query,
        request_settings,
        reader,
    )

    name_alias_mappings = [
        (select.name, select.expression.alias)
        for select in clickhouse_query.get_selected_columns_from_ast()
    ]
    discrepancies = [(name, alias) for name, alias in name_alias_mappings
                     if name != alias]
    if discrepancies:
        logger.warning(
            "Discrepancies between select clause names and aliases",
            extra={"mappings": discrepancies},
            exc_info=True,
        )

    # TODO: actually replace the column names in the result (data and
    # meta) with the names the user expects, taken from the
    # SelectedExpression objects.
    # As of now, to ensure the column names are what the user expects, we
    # rely on the aliases assigned to the AST expressions after parsing,
    # which is why the discrepancies logged above should not happen.

    return result
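A minimal sketch of what the TODO above could look like, assuming the result rows are mappings keyed by the aliases emitted in the SQL (an assumption about the QueryResult layout): build an alias-to-name map from the pairs computed above and rename the keys of each row.

from typing import Any, Mapping, Sequence, Tuple


def apply_column_names_sketch(
    rows: Sequence[Mapping[str, Any]],
    name_alias_mappings: Sequence[Tuple[str, str]],
) -> Sequence[Mapping[str, Any]]:
    # Only aliased expressions can be renamed; unaliased ones keep their key.
    alias_to_name = {alias: name for name, alias in name_alias_mappings if alias}
    return [
        {alias_to_name.get(key, key): value for key, value in row.items()}
        for row in rows
    ]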
Example #11
def test_translation(mappers: TranslationMappers, query: SnubaQuery,
                     expected: ClickhouseQuery) -> None:
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    assert (expected.get_selected_columns_from_ast() ==
            translated.get_selected_columns_from_ast())
    assert expected.get_groupby_from_ast() == translated.get_groupby_from_ast()
    assert (expected.get_condition_from_ast() ==
            translated.get_condition_from_ast())
    assert (expected.get_arrayjoin_from_ast() ==
            translated.get_arrayjoin_from_ast())
    assert expected.get_having_from_ast() == translated.get_having_from_ast()
    assert expected.get_orderby_from_ast() == translated.get_orderby_from_ast()
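A sketch of the __eq__ the TODO above hints at, comparing the same clauses this test checks by hand. It is not part of the actual Query class; the getter names are the ones used in the assertions above.

def queries_equal(lhs: ClickhouseQuery, rhs: ClickhouseQuery) -> bool:
    getters = (
        "get_selected_columns_from_ast",
        "get_groupby_from_ast",
        "get_condition_from_ast",
        "get_arrayjoin_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    )
    return all(getattr(lhs, name)() == getattr(rhs, name)() for name in getters)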
Example #12
def do_query(
    query: ClickhouseQuery,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    selected_col_names = [
        c.expression.column_name
        for c in query.get_selected_columns_from_ast() or []
        if isinstance(c.expression, Column)
    ]
    if selected_col_names == list(first_query_data[0].keys()):
        return QueryResult({"data": first_query_data}, {})
    elif selected_col_names == list(second_query_data[0].keys()):
        return QueryResult({"data": second_query_data}, {})
    else:
        raise ValueError(f"Unexpected selected columns: {selected_col_names}")
Example #13
def query_runner(query: Query, settings: RequestSettings,
                 reader: Reader) -> QueryResult:
    quantiles = tuple(
        Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1])
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "duration_quantiles",
            CurriedFunctionCall(
                "_snuba_duration_quantiles",
                FunctionCall(
                    None,
                    "quantilesIfMerge",
                    quantiles,
                ),
                (Column(None, None, "duration_quantiles"), ),
            ),
        ),
        SelectedExpression(
            "sessions",
            FunctionCall(
                "_snuba_sessions",
                "plus",
                (
                    FunctionCall(None, "countIfMerge",
                                 (Column(None, None, "sessions"), )),
                    FunctionCall(
                        None,
                        "sumIfMerge",
                        (Column(None, None, "sessions_preaggr"), ),
                    ),
                ),
            ),
        ),
        SelectedExpression(
            "users",
            FunctionCall("_snuba_users", "uniqIfMerge",
                         (Column(None, None, "users"), )),
        ),
    ]
    return QueryResult({}, {})
Example #14
    def execute(
        self, query: Query, request_settings: RequestSettings, runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (
            limit is None
            or limit == 0
            or query.get_groupby_from_ast()
            or not query.get_selected_columns_from_ast()
        ):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
            (Column(None, String(self.__id_column)), AnyExpression(),),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        total_columns = {
            (col.table_name, col.column_name)
            for col in query.get_all_ast_referenced_columns()
        }

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns(
            [
                SelectedExpression(
                    self.__id_column,
                    ColumnExpr(self.__id_column, None, self.__id_column),
                ),
                SelectedExpression(
                    self.__project_column,
                    ColumnExpr(self.__project_column, None, self.__project_column),
                ),
                SelectedExpression(
                    self.__timestamp_column,
                    ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
                ),
            ]
        )

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                self.__id_column,
                self.__project_column,
                self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]])
        )
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query
            # size, so we conservatively abandon the split.
            metrics.increment("column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            )
        )
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([event[self.__project_column] for event in result.result["data"]])
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None, (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
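For illustration, this is how the id_column_matcher built above behaves on a concrete condition. The import paths and the direct use of the AST FunctionCall constructor to build the condition are assumptions about this snuba version.

from snuba.query.conditions import ConditionFunctions
from snuba.query.expressions import Column as ColumnExpr
from snuba.query.expressions import FunctionCall as FunctionCallExpr
from snuba.query.expressions import Literal as LiteralExpr
from snuba.query.matchers import AnyExpression, Column, FunctionCall, Or, String

id_column_matcher = FunctionCall(
    Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
    (Column(None, String("event_id")), AnyExpression()),
)

# An equality condition on event_id should match, so the splitter would skip
# this query; a condition on any other column would leave match as None.
match = id_column_matcher.match(
    FunctionCallExpr(
        None,
        ConditionFunctions.EQ,
        (ColumnExpr(None, None, "event_id"), LiteralExpr(None, "abc123")),
    )
)
assert match is not None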
Example #15
def test_events_boolean_context() -> None:
    columns = ColumnSet([
        ("device_charging", UInt(8, Modifier(nullable=True))),
        ("contexts", Nested([("key", String()), ("value", String())])),
    ])
    query = ClickhouseQuery(
        TableSource("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        TableSource("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "multiIf",
                    (
                        binary_condition(
                            None,
                            ConditionFunctions.EQ,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"), ),
                            ),
                            Literal(None, ""),
                        ),
                        Literal(None, ""),
                        binary_condition(
                            None,
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"), ),
                            ),
                            literals_tuple(
                                None,
                                [Literal(None, "1"),
                                 Literal(None, "True")]),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPRequestSettings()
    MappingColumnPromoter({
        "contexts": {
            "device.charging": "device_charging"
        }
    }).process_query(query, settings)
    EventsBooleanContextsProcessor().process_query(query, settings)

    assert (query.get_selected_columns_from_ast() ==
            expected.get_selected_columns_from_ast())
Example #16
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different function
    replaces f1(...) with tag(f1)
    """
    column1 = Column(None, "t1", "c1")
    column2 = Column(None, "t1", "c2")
    function_1 = FunctionCall("alias", "f1", (column1, column2))
    function_2 = FunctionCall("alias", "f2", (column2, ))

    condition = binary_condition(ConditionFunctions.EQ, function_1,
                                 Literal(None, "1"))

    prewhere = binary_condition(ConditionFunctions.EQ, function_1,
                                Literal(None, "2"))

    orderby = OrderBy(OrderByDirection.ASC, function_2)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", function_1)],
        array_join=None,
        condition=condition,
        groupby=[function_1],
        having=None,
        prewhere=prewhere,
        order_by=[orderby],
    )

    def replace(exp: Expression) -> Expression:
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"), ))
        return exp

    query.transform_expressions(replace)

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "alias", FunctionCall("alias", "tag", (Literal(None, "f1"), )))
        ],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "1"),
        ),
        groupby=[FunctionCall("alias", "tag", (Literal(None, "f1"), ))],
        prewhere=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "2"),
        ),
        having=None,
        order_by=[orderby],
    )

    assert (query.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (query.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (query.get_groupby_from_ast() ==
            expected_query.get_groupby_from_ast())
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert (query.get_orderby_from_ast() ==
            expected_query.get_orderby_from_ast())

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions())