Ejemplo n.º 1
0
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    """Check the FROM clause left in place by ``SimpleJoinOptimizer``.

    A query over ``simple_join_structure`` is built from the given columns,
    conditions and group by, run through the optimizer, and the formatted
    FROM clause of the resulting data source is compared with ``expected``.
    """
    body = {
        "selected_columns": selected_cols,
        "conditions": conditions,
        "arrayjoin": None,
        "having": [],
        "groupby": groupby,
        "aggregations": [],
        "orderby": None,
        "limitby": None,
        "sample": 10,
        "limit": 100,
        "offset": 50,
        "totals": True,
        "granularity": 60,
    }
    query = Query(body, simple_join_structure)

    settings = HTTPRequestSettings()
    SimpleJoinOptimizer().process_query(query, settings)

    data_source = query.get_data_source()
    assert data_source.format_from() == expected
Ejemplo n.º 2
0
def test_full_query():
    """Every field supplied in the body is returned by its ``Query`` getter."""
    body = {
        "selected_columns": ["c1", "c2", "c3"],
        "conditions": [["c1", "=", "a"]],
        "arrayjoin": "tags",
        "having": [["c4", "=", "c"]],
        "groupby": ["project_id"],
        "aggregations": [["count()", "", "count"]],
        "orderby": "event_id",
        "limitby": (100, "environment"),
        "sample": 10,
        "limit": 100,
        "offset": 50,
        "totals": True,
        "granularity": 60,
    }
    source = TableSource("my_table", ColumnSet([]))
    query = Query(body, source)

    # Expected values are spelled out again as fresh literals rather than
    # read back from ``body``.
    assert query.get_selected_columns() == ["c1", "c2", "c3"]
    assert query.get_aggregations() == [["count()", "", "count"]]
    assert query.get_groupby() == ["project_id"]
    assert query.get_conditions() == [["c1", "=", "a"]]
    assert query.get_arrayjoin() == "tags"
    assert query.get_having() == [["c4", "=", "c"]]
    assert query.get_orderby() == "event_id"
    assert query.get_limitby() == (100, "environment")
    assert query.get_sample() == 10
    assert query.get_limit() == 100
    assert query.get_offset() == 50
    assert query.has_totals() is True
    assert query.get_granularity() == 60

    formatted_from = query.get_data_source().format_from()
    assert formatted_from == "my_table"
Ejemplo n.º 3
0
    def process_query(
        self,
        query: Query,
        request_settings: RequestSettings,
    ) -> None:
        """Replace a join data source with one table when only one is used.

        Nothing happens unless the query's data source is a ``JoinClause``.
        The table aliases are collected from the referenced columns; when
        exactly one alias is referenced, the table registered under that
        alias in the join becomes the query's data source. When several
        aliases are referenced the query is left untouched.
        """
        join = query.get_data_source()
        if not isinstance(join, JoinClause):
            return

        # Columns are still plain strings, so the alias has to be extracted
        # with a regex: group 1 of the pattern is the table alias. Columns
        # that do not match the pattern contribute nothing.
        matches = (
            QUALIFIED_COLUMN_REGEX.match(column)
            for column in query.get_all_referenced_columns()
        )
        aliases = {match[1] for match in matches if match}

        assert aliases, "Trying to otpimize a join query without aliases"
        if len(aliases) > 1:
            return

        query.set_data_source(join.get_tables()[aliases.pop()])
Ejemplo n.º 4
0
def test_prewhere(initial_table, consistent, expected_table) -> None:
    """Check which table ``ReadOnlyTableSelector`` leaves on the query.

    With the read-only table config switched on, the selector is run against
    a query on ``initial_table``; the resulting source must be a
    ``TableSource`` formatted as ``expected_table`` and must still carry the
    original columns, prewhere candidates and mandatory conditions.
    """
    state.set_config("enable_events_readonly_table", True)
    conditions = [
        ["d", "=", "1"],
        ["c", "=", "3"],
        ["a", "=", "1"],
        ["b", "=", "2"],
    ]
    columns = ColumnSet([("col", String())])
    table_source = TableSource(
        initial_table, columns, [["time", "=", "1"]], ["c1"]
    )
    query = Query({"conditions": conditions}, table_source)

    settings = HTTPRequestSettings(consistent=consistent)
    selector = ReadOnlyTableSelector("sentry_dist", "sentry_ro")
    selector.process_query(query, settings)

    result = query.get_data_source()
    assert isinstance(result, TableSource)
    assert result.format_from() == expected_table
    assert result.get_columns() == columns
    assert result.get_prewhere_candidates() == ["c1"]
    assert result.get_mandatory_conditions() == [["time", "=", "1"]]
Ejemplo n.º 5
0
    def test_data_source(
        self, query_body: MutableMapping[str, Any], expected_dataset: str
    ):
        """Run the "discover" query processors and check the chosen source.

        After every processor of the "discover" dataset has run, the query's
        FROM clause must equal the one of ``expected_dataset``'s source.
        """
        query = Query(query_body, get_dataset_source("discover"))

        settings = HTTPRequestSettings()
        for query_processor in get_dataset("discover").get_query_processors():
            query_processor.process_query(query, settings)

        actual_from = query.get_data_source().format_from()
        expected_from = get_dataset_source(expected_dataset).format_from()
        assert actual_from == expected_from
Ejemplo n.º 6
0
def test_empty_query():
    """Check what each ``Query`` getter returns for an empty body."""
    query = Query({}, TableSource("my_table", ColumnSet([])))

    for getter in (
        query.get_selected_columns,
        query.get_aggregations,
        query.get_groupby,
        query.get_conditions,
        query.get_arrayjoin,
    ):
        assert getter() is None

    # HAVING is the one clause reported as an empty list instead of None.
    assert query.get_having() == []

    for getter in (
        query.get_orderby,
        query.get_limitby,
        query.get_sample,
        query.get_limit,
    ):
        assert getter() is None

    assert query.get_offset() == 0
    assert query.has_totals() is False

    formatted_from = query.get_data_source().format_from()
    assert formatted_from == "my_table"
Ejemplo n.º 7
0
 def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
     """Move the highest-priority eligible conditions into PREWHERE.

     A condition is eligible when ``util.is_condition`` accepts it and at
     least one column of its left-hand expression is among the data source's
     prewhere candidates. Eligible conditions are ranked by the position of
     their best column in the candidate list, capped at the configured
     maximum, removed from the query conditions and set as the prewhere.

     The method returns without touching the query when the data source has
     no prewhere candidates or the query has no conditions.
     """
     limit: int = (
         self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
     )
     keys = query.get_data_source().get_prewhere_candidates()
     if not keys:
         return
     conditions = query.get_conditions()
     if not conditions:
         return

     # Pair every eligible condition with the columns of its left-hand side.
     candidates = []
     for cond in conditions:
         if not util.is_condition(cond):
             continue
         if not any(col in keys for col in util.columns_in_expr(cond[0])):
             continue
         candidates.append((util.columns_in_expr(cond[0]), cond))

     # Lower index in the candidate list means higher priority; the sort is
     # stable, so ties keep the order of the original conditions.
     ranked = [
         (min(keys.index(col) for col in cols if col in keys), cond)
         for cols, cond in candidates
     ]
     ranked.sort(key=lambda priority_and_cond: priority_and_cond[0])

     selected: Sequence[Condition] = []
     if ranked:
         selected = [cond for _, cond in ranked][:limit]
         query.set_conditions(
             [cond for cond in conditions if cond not in selected]
         )
     query.set_prewhere(selected)
Ejemplo n.º 8
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """Point the query at the read-only table when that is allowed.

        The swap only happens when the ``enable_events_readonly_table``
        config is set, the request does not ask for consistency, and the
        current source formats to the table this processor replaces. The new
        ``TableSource`` reuses the columns, mandatory conditions and prewhere
        candidates of the source it replaces.
        """
        if not state.get_config("enable_events_readonly_table", False):
            return
        if request_settings.get_consistent():
            return

        current = query.get_data_source()
        if current.format_from() != self.__table_to_replace:
            return

        query.set_data_source(
            TableSource(
                table_name=self.__read_only_table,
                columns=current.get_columns(),
                mandatory_conditions=current.get_mandatory_conditions(),
                prewhere_candidates=current.get_prewhere_candidates(),
            )
        )
Ejemplo n.º 9
0
    def __init__(
        self,
        query: Query,
        settings: RequestSettings,
    ) -> None:
        """Capture the parts of ``query`` and ``settings`` used by this object.

        Args:
            query: Query whose AST components (selected columns, condition,
                group by, having, order by, array join, prewhere) and scalar
                fields (granularity, limit, limit by, offset, final, sample,
                totals) are copied onto the instance.
            settings: Request settings; the turbo flag is read from it and
                the object itself is kept.

        Raises:
            AssertionError: If the query has a HAVING clause but no GROUP BY.
        """
        # Snuba query structure
        # Referencing them here directly since it makes it easier
        # to process this query independently from the Snuba Query
        # and there is no risk in doing so since they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_data_source()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        # A HAVING clause is only accepted together with a GROUP BY.
        if self.__having:
            assert self.__groupby, "found HAVING clause with no GROUP BY"

        # Clickhouse specific fields. Some are still in the Snuba
        # query and have to be moved.
        self.__turbo = settings.get_turbo()
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        # TODO: Pre where processing will become a step in Clickhouse Query processing
        # instead of being pulled from the Snuba Query
        self.__prewhere = query.get_prewhere_ast()

        self.__settings = settings
        # Starts unset; presumably filled in when the query is first
        # formatted -- confirm against the rest of the class.
        self.__formatted_query: Optional[str] = None
Ejemplo n.º 10
0
    def __init__(
        self,
        dataset: Dataset,
        query: Query,
        settings: RequestSettings,
    ) -> None:
        """Render ``query`` as a single SQL string kept on the instance.

        Each clause (SELECT, FROM with optional FINAL and SAMPLE, ARRAY JOIN,
        PREWHERE, WHERE, GROUP BY with optional WITH TOTALS, HAVING,
        ORDER BY, LIMIT BY, LIMIT) is built separately; the non-empty ones
        are then joined with single spaces in that order.

        Args:
            dataset: Passed through to ``column_expr`` / ``conditions_expr``.
            query: Source of every clause.
            settings: Supplies the turbo flag used to pick a sample rate.

        Raises:
            AssertionError: If the query has HAVING conditions but no
                GROUP BY.
        """
        # NOTE(review): one parsing context is shared by every expression
        # call below, so the order of those calls may matter -- confirm
        # before reordering any of the clause-building steps.
        ctx = ParsingContext()

        aggregates = [
            column_expr(dataset, col, query, ctx, alias, agg)
            for (agg, col, alias) in query.get_aggregations()
        ]
        groupby = util.to_list(query.get_groupby())
        groups = [column_expr(dataset, gb, query, ctx) for gb in groupby]
        selected = query.get_selected_columns() or []
        columns = [
            column_expr(dataset, util.tuplify(name), query, ctx)
            for name in selected
        ]
        select_clause = "SELECT {}".format(
            ", ".join(groups + aggregates + columns)
        )

        from_clause = "FROM {}".format(query.get_data_source().format_from())
        if query.get_final():
            from_clause += " FINAL"

        # An explicit sample on the query wins over the turbo default; a
        # source that does not support sampling never gets a SAMPLE clause.
        sample_rate = None
        if query.get_data_source().supports_sample():
            if query.get_sample():
                sample_rate = query.get_sample()
            elif settings.get_turbo():
                sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        if sample_rate:
            from_clause += " SAMPLE {}".format(sample_rate)

        join_clause = (
            "ARRAY JOIN {}".format(query.get_arrayjoin())
            if query.get_arrayjoin()
            else ""
        )

        where_clause = ""
        if query.get_conditions():
            where_expr = conditions_expr(
                dataset, query.get_conditions(), query, ctx
            )
            where_clause = "WHERE {}".format(where_expr)

        prewhere_clause = ""
        if query.get_prewhere():
            prewhere_expr = conditions_expr(
                dataset, query.get_prewhere(), query, ctx
            )
            prewhere_clause = "PREWHERE {}".format(prewhere_expr)

        group_clause = ""
        if groupby:
            # The group by expressions are deliberately expanded a second
            # time here rather than reusing the SELECT ones.
            group_terms = [
                column_expr(dataset, gb, query, ctx) for gb in groupby
            ]
            group_clause = "GROUP BY ({})".format(", ".join(group_terms))
            if query.has_totals():
                group_clause += " WITH TOTALS"

        having_clause = ""
        having_conditions = query.get_having()
        if having_conditions:
            assert groupby, "found HAVING clause with no GROUP BY"
            having_expr = conditions_expr(
                dataset, having_conditions, query, ctx
            )
            having_clause = "HAVING {}".format(having_expr)

        order_clause = ""
        if query.get_orderby():
            expanded = [
                column_expr(dataset, util.tuplify(ob), query, ctx)
                for ob in util.to_list(query.get_orderby())
            ]
            # A leading "-" on the expanded expression selects descending.
            order_terms = [
                "{} {}".format(
                    term.lstrip("-"), "DESC" if term.startswith("-") else "ASC"
                )
                for term in expanded
            ]
            order_clause = "ORDER BY {}".format(", ".join(order_terms))

        limitby_clause = ""
        if query.get_limitby() is not None:
            limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

        limit_clause = ""
        if query.get_limit() is not None:
            limit_clause = "LIMIT {}, {}".format(
                query.get_offset(), query.get_limit()
            )

        clauses = [
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause,
        ]
        # Empty clauses are dropped so the result has no doubled spaces.
        self.__formatted_query = " ".join(filter(None, clauses))
Ejemplo n.º 11
0
Archivo: query.py Proyecto: Appva/snuba
    def __init__(
        self,
        dataset: Dataset,
        query: Query,
        settings: RequestSettings,
        prewhere_conditions: Sequence[str],
    ) -> None:
        """Format ``query`` into one SQL string stored on the instance.

        The clauses are built one by one and the non-empty ones joined with
        single spaces in this order: SELECT, FROM (with optional FINAL and
        SAMPLE), ARRAY JOIN, PREWHERE, WHERE, GROUP BY (with optional
        WITH TOTALS), HAVING, ORDER BY, LIMIT BY, LIMIT.

        Args:
            dataset: Passed through to ``column_expr`` / ``conditions_expr``.
            query: Source of every clause except PREWHERE.
            settings: Supplies the turbo flag used to pick a sample rate.
            prewhere_conditions: Conditions rendered in the PREWHERE clause;
                the clause is omitted when this is empty.

        Raises:
            AssertionError: If the query has HAVING conditions but no
                GROUP BY.
        """
        # NOTE(review): the same parsing context is handed to every
        # expression call, so the order of those calls may matter --
        # confirm before reordering the steps below.
        context = ParsingContext()

        aggregate_parts = [
            column_expr(dataset, col, query, context, alias, agg)
            for (agg, col, alias) in query.get_aggregations()
        ]
        groupby = util.to_list(query.get_groupby())
        group_parts = [
            column_expr(dataset, gb, query, context) for gb in groupby
        ]
        names = query.get_selected_columns() or []
        column_parts = [
            column_expr(dataset, util.tuplify(name), query, context)
            for name in names
        ]
        select_list = ", ".join(group_parts + aggregate_parts + column_parts)
        select_clause = "SELECT {}".format(select_list)

        from_clause = "FROM {}".format(query.get_data_source().format_from())
        if query.get_final():
            from_clause = from_clause + " FINAL"

        # The sample set on the query takes precedence over the turbo rate.
        sample_rate = None
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        if sample_rate:
            from_clause = "{} SAMPLE {}".format(from_clause, sample_rate)

        if query.get_arrayjoin():
            join_clause = "ARRAY JOIN {}".format(query.get_arrayjoin())
        else:
            join_clause = ""

        if query.get_conditions():
            where_clause = "WHERE {}".format(
                conditions_expr(
                    dataset, query.get_conditions(), query, context
                )
            )
        else:
            where_clause = ""

        if prewhere_conditions:
            prewhere_clause = "PREWHERE {}".format(
                conditions_expr(dataset, prewhere_conditions, query, context)
            )
        else:
            prewhere_clause = ""

        group_clause = ""
        if groupby:
            # The group by expressions are expanded again for this clause
            # instead of reusing the ones built for SELECT.
            regrouped = [
                column_expr(dataset, gb, query, context) for gb in groupby
            ]
            group_clause = "GROUP BY ({})".format(", ".join(regrouped))
            if query.has_totals():
                group_clause = group_clause + " WITH TOTALS"

        having_clause = ""
        having_conditions = query.get_having()
        if having_conditions:
            assert groupby, "found HAVING clause with no GROUP BY"
            having_clause = "HAVING {}".format(
                conditions_expr(dataset, having_conditions, query, context)
            )

        order_clause = ""
        if query.get_orderby():
            expanded = [
                column_expr(dataset, util.tuplify(ob), query, context)
                for ob in util.to_list(query.get_orderby())
            ]
            # A leading "-" on the expanded expression selects descending.
            directed = []
            for term in expanded:
                direction = "DESC" if term.startswith("-") else "ASC"
                directed.append("{} {}".format(term.lstrip("-"), direction))
            order_clause = "ORDER BY {}".format(", ".join(directed))

        limitby_clause = ""
        if query.get_limitby() is not None:
            limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())

        limit_clause = ""
        if query.get_limit() is not None:
            limit_clause = "LIMIT {}, {}".format(
                query.get_offset(), query.get_limit()
            )

        ordered_clauses = (
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause,
        )
        # Only non-empty clauses make it into the final string.
        self.__formatted_query = " ".join(
            clause for clause in ordered_clauses if clause
        )