def test_format_expressions(name: str, query: ClickhouseQuery,
                            expected_query: ClickhouseQuery) -> None:
    MappingColumnPromoter({
        "tags": {
            "promoted_tag": "promoted"
        }
    }).process_query(query, HTTPRequestSettings())

    assert (query.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
Example #2
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization
    on.
    Which means: if the arrayJoin is in the select clause, there
    are one or more top level AND condition on the arrayJoin and
    there is no OR condition in the query.
    """
    array_join_found = any(
        array_join_pattern(column_name).match(f) is not None
        for selected in query.get_selected_columns_from_ast() or []
        for f in selected.expression)

    if not array_join_found:
        return set()

    ast_condition = query.get_condition_from_ast()
    cond_keys = (_get_mapping_keys_in_condition(ast_condition, column_name)
                 if ast_condition is not None else set())
    if cond_keys is None:
        # This means we found an OR. We conservatively give up, even though
        # there could be cases where this condition is still optimizable.
        return set()

    ast_having = query.get_having_from_ast()
    having_keys = (_get_mapping_keys_in_condition(ast_having, column_name)
                   if ast_having is not None else set())
    if having_keys is None:
        # Same as above
        return set()

    return cond_keys | having_keys
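A minimal sketch of the contract the code above relies on for _get_mapping_keys_in_condition: walk the top level AND conditions, give up with None as soon as an OR appears anywhere in a branch, and otherwise collect the mapping keys matched by the arrayJoin pattern. The helper get_first_level_and_conditions, the BooleanFunctions constant and the "key" parameter name on the match result are assumptions about this snuba version, not guaranteed API.

from typing import Optional, Set

from snuba.query.conditions import BooleanFunctions, get_first_level_and_conditions
from snuba.query.expressions import Expression, FunctionCall


def _get_mapping_keys_in_condition_sketch(
    condition: Expression, column_name: str
) -> Optional[Set[str]]:
    keys: Set[str] = set()
    for cond in get_first_level_and_conditions(condition):
        # Expressions iterate over themselves and their children, so this
        # detects an OR nested anywhere in the branch, which makes the
        # arrayFilter rewrite unsafe.
        if any(
            isinstance(exp, FunctionCall) and exp.function_name == BooleanFunctions.OR
            for exp in cond
        ):
            return None
        # array_join_pattern is the same helper used in the function above.
        match = array_join_pattern(column_name).match(cond)
        if match is not None:
            keys.add(match.string("key"))  # "key" is an assumed parameter name
    return keys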
Example #3
def query_runner(query: Query, settings: RequestSettings,
                 reader: Reader[SqlQuery]) -> QueryResult:
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "duration_quantiles",
            CurriedFunctionCall(
                "duration_quantiles",
                FunctionCall(
                    None,
                    "quantilesIfMerge",
                    (Literal(None, 0.5), Literal(None, 0.9)),
                ),
                (Column(None, None, "duration_quantiles"), ),
            ),
        ),
        SelectedExpression(
            "sessions",
            FunctionCall("sessions", "countIfMerge",
                         (Column(None, None, "sessions"), )),
        ),
        SelectedExpression(
            "users",
            FunctionCall("users", "uniqIfMerge",
                         (Column(None, None, "users"), )),
        ),
    ]
    return QueryResult({}, {})
Example #4
    def __init__(self, query: Query, settings: RequestSettings,) -> None:
        # Clickhouse query structure.
        # We reference the clauses here directly since that makes it easier
        # to process this query independently from the Clickhouse Query
        # object, and there is no risk in doing so since they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_data_source()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        if self.__having:
            assert self.__groupby, "found HAVING clause with no GROUP BY"

        self.__turbo = settings.get_turbo()
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        self.__prewhere = query.get_prewhere_ast()

        self.__settings = settings
        self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
        self.__formatted_query: Optional[str] = None
        self.__sql_data: Optional[Mapping[str, str]] = None
Example #5
def query_runner(
    query: Query, settings: RequestSettings, reader: Reader[SqlQuery]
) -> QueryResult:
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "tags[transaction]", Column("tags[transaction]", None, "transaction")
        ),
        SelectedExpression(
            "contexts[browser.name]",
            FunctionCall(
                "contexts[browser.name]",
                "arrayElement",
                (
                    Column(None, None, "contexts.value"),
                    FunctionCall(
                        None,
                        "indexOf",
                        (
                            Column(None, None, "contexts.key"),
                            Literal(None, "browser.name"),
                        ),
                    ),
                ),
            ),
        ),
    ]
    return QueryResult({}, {})
Example #6
    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:

        if events_storage.get_storage_key() == StorageKey.EVENTS:
            transaction_col_name = "transaction"
        else:
            transaction_col_name = "transaction_name"

        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "tags[transaction]",
                Column("_snuba_tags[transaction]", None, transaction_col_name),
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "_snuba_contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})
Example #7
def test_event_id_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(
                "the_event_id", Column("the_event_id", None, "event_id")
            ),
        ],
    )
    expected = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(
                "the_event_id",
                FunctionCall(
                    "the_event_id",
                    "replaceAll",
                    (
                        FunctionCall(
                            None, "toString", (Column(None, None, "event_id"),),
                        ),
                        Literal(None, "-"),
                        Literal(None, ""),
                    ),
                ),
            ),
        ],
    )

    EventIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    formatted = unprocessed.get_selected_columns_from_ast()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert formatted == "(replaceAll(toString(event_id), '-', '') AS the_event_id)"
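The rewrite asserted above can be expressed with the transform_expressions hook shown in Example #16. The sketch below is only an illustration of that substitution under that assumption, not EventIdColumnProcessor's actual implementation.

def format_event_id_sketch(query: Query) -> None:
    def replace(exp: Expression) -> Expression:
        # Rewrite references to the raw UUID column into the string form the
        # test above expects: replaceAll(toString(event_id), '-', '').
        if isinstance(exp, Column) and exp.column_name == "event_id":
            return FunctionCall(
                exp.alias,
                "replaceAll",
                (
                    FunctionCall(
                        None, "toString", (Column(None, exp.table_name, "event_id"),)
                    ),
                    Literal(None, "-"),
                    Literal(None, ""),
                ),
            )
        return exp

    query.transform_expressions(replace)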
Example #8
def test_events_column_format_expressions() -> None:
    unprocessed = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id", Column("the_group_id", None, "group_id")
            ),
            SelectedExpression("the_message", Column("the_message", None, "message")),
        ],
    )
    expected = Query(
        Table("events", ColumnSet([])),
        selected_columns=[
            SelectedExpression("dr_claw", Column("dr_claw", None, "culprit")),
            SelectedExpression(
                "the_group_id",
                FunctionCall(
                    "the_group_id",
                    "nullIf",
                    (Column(None, None, "group_id"), Literal(None, 0),),
                ),
            ),
            SelectedExpression("the_message", Column("the_message", None, "message"),),
        ],
    )

    GroupIdColumnProcessor().process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )

    expected = (
        "(nullIf(group_id, 0) AS the_group_id)",
        "(message AS the_message)",
    )

    for idx, column in enumerate(unprocessed.get_selected_columns_from_ast()[1:]):
        formatted = column.expression.accept(ClickhouseExpressionFormatter())
        assert expected[idx] == formatted
Example #9
def test_tags_processor(query_body: MutableMapping[str, Any],
                        expected_query: ClickhouseQuery) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    assert (processed.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (processed.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (processed.get_having_from_ast() ==
            expected_query.get_having_from_ast())
Example #10
def _run_and_apply_column_names(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Executes the query and, after that, replaces the column names in
    QueryResult with the names the user expects and that are stored in
    the SelectedExpression objects in the Query.
    This will eventually allow us to remove aliases from the Query AST,
    since those aliases are currently only needed to produce the names the
    user expects in the output.
    """

    result = _format_storage_query_and_run(
        timer,
        query_metadata,
        from_date,
        to_date,
        referrer,
        clickhouse_query,
        request_settings,
        reader,
    )

    name_alias_mappings = [
        (select.name, select.expression.alias)
        for select in clickhouse_query.get_selected_columns_from_ast()
    ]
    discrepancies = [(name, alias) for name, alias in name_alias_mappings
                     if name != alias]
    if discrepancies:
        logger.warning(
            "Discrepancies between select clause names and aliases",
            extra={"mappings": discrepancies},
            exc_info=True,
        )

    # TODO: actually replace the column names in the result (data and
    # meta) with the names the user expects, taken from the
    # SelectedExpression objects.
    # As of now, to ensure the column names are what the user expects, we
    # rely on the aliases assigned to the AST expressions after parsing,
    # which is why the discrepancies logged above should not happen.

    return result
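A minimal sketch of what the TODO above could look like, assuming the result rows are mappings keyed by the aliases emitted in the SQL (an assumption about the QueryResult layout): build an alias-to-name map from the pairs computed above and rename the keys of each row.

from typing import Any, Mapping, Sequence, Tuple


def apply_column_names_sketch(
    rows: Sequence[Mapping[str, Any]],
    name_alias_mappings: Sequence[Tuple[str, str]],
) -> Sequence[Mapping[str, Any]]:
    # Only aliased expressions can be renamed; unaliased ones keep their key.
    alias_to_name = {alias: name for name, alias in name_alias_mappings if alias}
    return [
        {alias_to_name.get(key, key): value for key, value in row.items()}
        for row in rows
    ]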
Example #11
def test_translation(mappers: TranslationMappers, query: SnubaQuery,
                     expected: ClickhouseQuery) -> None:
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    assert (expected.get_selected_columns_from_ast() ==
            translated.get_selected_columns_from_ast())
    assert expected.get_groupby_from_ast() == translated.get_groupby_from_ast()
    assert (expected.get_condition_from_ast() ==
            translated.get_condition_from_ast())
    assert (expected.get_arrayjoin_from_ast() ==
            translated.get_arrayjoin_from_ast())
    assert expected.get_having_from_ast() == translated.get_having_from_ast()
    assert expected.get_orderby_from_ast() == translated.get_orderby_from_ast()
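A sketch of the __eq__ the TODO above hints at, comparing the same clauses this test checks by hand. It is not part of the actual Query class; the getter names are the ones used in the assertions above.

def queries_equal(lhs: ClickhouseQuery, rhs: ClickhouseQuery) -> bool:
    getters = (
        "get_selected_columns_from_ast",
        "get_groupby_from_ast",
        "get_condition_from_ast",
        "get_arrayjoin_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    )
    return all(getattr(lhs, name)() == getattr(rhs, name)() for name in getters)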
Example #12
def do_query(
    query: ClickhouseQuery,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    selected_col_names = [
        c.expression.column_name
        for c in query.get_selected_columns_from_ast() or []
        if isinstance(c.expression, Column)
    ]
    if selected_col_names == list(first_query_data[0].keys()):
        return QueryResult({"data": first_query_data}, {})
    elif selected_col_names == list(second_query_data[0].keys()):
        return QueryResult({"data": second_query_data}, {})
    else:
        raise ValueError(f"Unexpected selected columns: {selected_col_names}")
Example #13
def query_runner(query: Query, settings: RequestSettings,
                 reader: Reader) -> QueryResult:
    quantiles = tuple(
        Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1])
    assert query.get_selected_columns_from_ast() == [
        SelectedExpression(
            "duration_quantiles",
            CurriedFunctionCall(
                "_snuba_duration_quantiles",
                FunctionCall(
                    None,
                    "quantilesIfMerge",
                    quantiles,
                ),
                (Column(None, None, "duration_quantiles"), ),
            ),
        ),
        SelectedExpression(
            "sessions",
            FunctionCall(
                "_snuba_sessions",
                "plus",
                (
                    FunctionCall(None, "countIfMerge",
                                 (Column(None, None, "sessions"), )),
                    FunctionCall(
                        None,
                        "sumIfMerge",
                        (Column(None, None, "sessions_preaggr"), ),
                    ),
                ),
            ),
        ),
        SelectedExpression(
            "users",
            FunctionCall("_snuba_users", "uniqIfMerge",
                         (Column(None, None, "users"), )),
        ),
    ]
    return QueryResult({}, {})
Example #14
    def execute(
        self, query: Query, request_settings: RequestSettings, runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (
            limit is None
            or limit == 0
            or query.get_groupby_from_ast()
            or not query.get_selected_columns_from_ast()
        ):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
            (Column(None, String(self.__id_column)), AnyExpression(),),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        total_columns = {
            (col.table_name, col.column_name)
            for col in query.get_all_ast_referenced_columns()
        }

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns(
            [
                SelectedExpression(
                    self.__id_column,
                    ColumnExpr(self.__id_column, None, self.__id_column),
                ),
                SelectedExpression(
                    self.__project_column,
                    ColumnExpr(self.__project_column, None, self.__project_column),
                ),
                SelectedExpression(
                    self.__timestamp_column,
                    ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
                ),
            ]
        )

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                self.__id_column,
                self.__project_column,
                self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]])
        )
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query
            # size, so we conservatively abandon the split.
            metrics.increment("column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            )
        )
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([event[self.__project_column] for event in result.result["data"]])
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None, (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
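For illustration, this is how the id_column_matcher built above behaves on a concrete condition. The import paths and the direct use of the AST FunctionCall constructor to build the condition are assumptions about this snuba version.

from snuba.query.conditions import ConditionFunctions
from snuba.query.expressions import Column as ColumnExpr
from snuba.query.expressions import FunctionCall as FunctionCallExpr
from snuba.query.expressions import Literal as LiteralExpr
from snuba.query.matchers import AnyExpression, Column, FunctionCall, Or, String

id_column_matcher = FunctionCall(
    Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
    (Column(None, String("event_id")), AnyExpression()),
)

# An equality condition on event_id should match, so the splitter would skip
# this query; a condition on any other column would leave match as None.
match = id_column_matcher.match(
    FunctionCallExpr(
        None,
        ConditionFunctions.EQ,
        (ColumnExpr(None, None, "event_id"), LiteralExpr(None, "abc123")),
    )
)
assert match is not None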
Example #15
def test_events_boolean_context() -> None:
    columns = ColumnSet([
        ("device_charging", UInt(8, Modifier(nullable=True))),
        ("contexts", Nested([("key", String()), ("value", String())])),
    ])
    query = ClickhouseQuery(
        TableSource("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "device.charging"),
                            ),
                        ),
                    ),
                ),
            )
        ],
    )

    expected = ClickhouseQuery(
        TableSource("events", columns),
        selected_columns=[
            SelectedExpression(
                "contexts[device.charging]",
                FunctionCall(
                    "contexts[device.charging]",
                    "multiIf",
                    (
                        binary_condition(
                            None,
                            ConditionFunctions.EQ,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"), ),
                            ),
                            Literal(None, ""),
                        ),
                        Literal(None, ""),
                        binary_condition(
                            None,
                            ConditionFunctions.IN,
                            FunctionCall(
                                None,
                                "toString",
                                (Column(None, None, "device_charging"), ),
                            ),
                            literals_tuple(
                                None,
                                [Literal(None, "1"),
                                 Literal(None, "True")]),
                        ),
                        Literal(None, "True"),
                        Literal(None, "False"),
                    ),
                ),
            )
        ],
    )

    settings = HTTPRequestSettings()
    MappingColumnPromoter({
        "contexts": {
            "device.charging": "device_charging"
        }
    }).process_query(query, settings)
    EventsBooleanContextsProcessor().process_query(query, settings)

    assert (query.get_selected_columns_from_ast() ==
            expected.get_selected_columns_from_ast())
Example #16
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different function
    replaces f1(...) with tag(f1)
    """
    column1 = Column(None, "t1", "c1")
    column2 = Column(None, "t1", "c2")
    function_1 = FunctionCall("alias", "f1", (column1, column2))
    function_2 = FunctionCall("alias", "f2", (column2, ))

    condition = binary_condition(ConditionFunctions.EQ, function_1,
                                 Literal(None, "1"))

    prewhere = binary_condition(ConditionFunctions.EQ, function_1,
                                Literal(None, "2"))

    orderby = OrderBy(OrderByDirection.ASC, function_2)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", function_1)],
        array_join=None,
        condition=condition,
        groupby=[function_1],
        having=None,
        prewhere=prewhere,
        order_by=[orderby],
    )

    def replace(exp: Expression) -> Expression:
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"), ))
        return exp

    query.transform_expressions(replace)

    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "alias", FunctionCall("alias", "tag", (Literal(None, "f1"), )))
        ],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "1"),
        ),
        groupby=[FunctionCall("alias", "tag", (Literal(None, "f1"), ))],
        prewhere=binary_condition(
            ConditionFunctions.EQ,
            FunctionCall("alias", "tag", (Literal(None, "f1"), )),
            Literal(None, "2"),
        ),
        having=None,
        order_by=[orderby],
    )

    assert (query.get_selected_columns_from_ast() ==
            expected_query.get_selected_columns_from_ast())
    assert (query.get_condition_from_ast() ==
            expected_query.get_condition_from_ast())
    assert (query.get_groupby_from_ast() ==
            expected_query.get_groupby_from_ast())
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert (query.get_orderby_from_ast() ==
            expected_query.get_orderby_from_ast())

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions())