Beispiel #1
0
    def __init__(self, query: Query, settings: RequestSettings,) -> None:
        """
        Store the individual parts of the query and the request settings.

        :param query: the query whose clauses are read, one getter each,
            and kept on the instance.
        :param settings: the request settings; stored as-is and also used
            to read the turbo flag.
        :raises AssertionError: if the query has a HAVING clause but no
            GROUP BY.
        """
        # Structure of the Clickhouse query. The parts are referenced
        # directly instead of being copied: this makes it easier to process
        # this query independently from the Clickhouse Query, and it carries
        # no risk since they are immutable.
        self.__selected_columns = query.get_selected_columns_from_ast()
        self.__condition = query.get_condition_from_ast()
        self.__groupby = query.get_groupby_from_ast()
        self.__having = query.get_having_from_ast()
        self.__orderby = query.get_orderby_from_ast()
        self.__data_source = query.get_data_source()
        self.__arrayjoin = query.get_arrayjoin_from_ast()
        self.__granularity = query.get_granularity()
        self.__limit = query.get_limit()
        self.__limitby = query.get_limitby()
        self.__offset = query.get_offset()

        # A HAVING clause only makes sense together with a GROUP BY.
        assert (
            not self.__having or self.__groupby
        ), "found HAVING clause with no GROUP BY"

        # Remaining query-level modifiers.
        self.__final = query.get_final()
        self.__sample = query.get_sample()
        self.__hastotals = query.has_totals()
        self.__prewhere = query.get_prewhere_ast()

        # Request-level settings.
        self.__settings = settings
        self.__turbo = settings.get_turbo()

        # Formatting results start out unset (presumably filled in lazily
        # by the formatting methods of this class -- not visible here).
        self.__formatted_query: Optional[str] = None
        self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
        self.__sql_data: Optional[Mapping[str, str]] = None
Beispiel #2
0
    def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
        """
        Replace a join data source with a single table when only one is used.

        When the data source of the query is a JoinClause but all the
        qualified columns referenced by the query share the same table
        alias, the data source of the query is replaced (in place) with the
        table the join registers under that alias. Queries whose data source
        is not a JoinClause, or that reference more than one alias, are left
        untouched.

        :param query: the query to inspect and, possibly, modify.
        :param request_settings: not used by this processor.
        :raises AssertionError: if the join query does not reference any
            qualified (aliased) column.
        """
        from_clause = query.get_data_source()
        if not isinstance(from_clause, JoinClause):
            return

        # This will be much better when we will represent columns
        # with a more structured data type than strings.
        referenced_aliases = set()
        for qualified_column in query.get_all_referenced_columns():
            match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
            if match:
                # match[1] is the first parenthesized group in the regex,
                # thus the table alias.
                referenced_aliases.add(match[1])

        if not referenced_aliases:
            # Raised explicitly rather than through an `assert` statement:
            # asserts are stripped when Python runs with -O, in which case
            # the code below would fail with an unrelated KeyError while
            # extracting the alias from an empty set.
            raise AssertionError("Trying to optimize a join query without aliases")
        if len(referenced_aliases) > 1:
            # More than one table is actually needed: keep the join.
            return

        from_tables = from_clause.get_tables()
        # Exactly one alias is left at this point.
        (table_alias,) = referenced_aliases

        query.set_data_source(from_tables[table_alias])
Beispiel #3
0
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Format the storage query and pass it to the DB specific code for execution.

    The query is wrapped into an AstSqlQuery within a "create_query" tracing
    span, the timer is marked with "prepare_query", a few stats about the
    query are collected and finally raw_query is invoked within a second
    span described by the formatted SQL. The value returned by raw_query is
    returned unchanged.
    """

    # TODO: A wrapper of this function is where we will transform the result
    # according to the SelectedExpression objects in the query, to ensure the
    # fields in the QueryResult have the same names the user expects.

    table_name = clickhouse_query.get_data_source().format_from()

    with sentry_sdk.start_span(description="create_query", op="db") as build_span:
        sql_query = AstSqlQuery(clickhouse_query, request_settings)
        build_span.set_data("query", sql_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    # Values are gathered in the same order in which they appear in the
    # stats dictionary.
    is_final = clickhouse_query.get_final()
    num_days = (to_date - from_date).days
    sample_rate = clickhouse_query.get_sample()
    stats = {
        "clickhouse_table": table_name,
        "final": is_final,
        "referrer": referrer,
        "num_days": num_days,
        "sample": sample_rate,
    }

    # The formatted SQL is used as the description of the execution span.
    sql_text = sql_query.format_sql()
    with sentry_sdk.start_span(description=sql_text, op="db") as run_span:
        run_span.set_tag("table", table_name)

        result = raw_query(
            clickhouse_query,
            request_settings,
            sql_query,
            reader,
            timer,
            query_metadata,
            stats,
            run_span.trace_id,
        )
        return result
Beispiel #4
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Run both the legacy and the AST prewhere processors on the query.

        Nothing is done when the data source of the query provides no
        prewhere candidates. The maximum number of prewhere conditions is
        the one configured on this instance when it is truthy, otherwise
        settings.MAX_PREWHERE_CONDITIONS.

        :param query: the query to process; handed to both processors.
        :param request_settings: not used by this processor.
        """
        condition_limit: int = (self.__max_prewhere_conditions
                                or settings.MAX_PREWHERE_CONDITIONS)
        candidate_keys = query.get_data_source().get_prewhere_candidates()
        if candidate_keys:
            # Both implementations modify the query, but one relies on the
            # legacy representation and the other one on the AST, so they do
            # not interfere with each other and can be executed
            # independently, one after the other.
            for processor_class in (LegacyPrewhereProcessor,
                                    ASTPrewhereProcessor):
                processor_class().process_query(query, condition_limit,
                                                candidate_keys)
Beispiel #5
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Move the best suited top level AND conditions of the query to PREWHERE.

        A top level condition is eligible when it is a FunctionCall whose
        function name is in ALLOWED_OPERATORS and it references at least one
        of the prewhere candidate columns of the data source. Eligible
        conditions are ranked by the lowest position, in the candidates
        list, of the columns they reference, and at most
        ``max_prewhere_conditions`` of them are set as the prewhere condition
        of the query. Whatever is left stays in the regular condition.

        The query is left untouched when there are no prewhere candidates,
        no condition, or no eligible condition.

        :param query: the query to modify in place.
        :param request_settings: not used by this processor.
        """
        condition_limit: int = (self.__max_prewhere_conditions
                                or settings.MAX_PREWHERE_CONDITIONS)
        prewhere_keys = query.get_data_source().get_prewhere_candidates()
        if not prewhere_keys:
            return

        ast_condition = query.get_condition_from_ast()
        if ast_condition is None:
            return

        # Pair every eligible condition with its priority, which is the
        # lowest position in the prewhere keys list among its columns.
        ranked_conditions = []
        for condition in get_first_level_and_conditions(ast_condition):
            if not isinstance(condition, FunctionCall):
                continue
            if condition.function_name not in ALLOWED_OPERATORS:
                continue
            key_positions = [
                prewhere_keys.index(column.column_name)
                for column in get_columns_in_expression(condition)
                if column.column_name in prewhere_keys
            ]
            if key_positions:
                ranked_conditions.append((min(key_positions), condition))

        if not ranked_conditions:
            return

        # Highest priority first. The sort is stable, so conditions with the
        # same priority keep the order they have in the query.
        ranked_conditions.sort(key=lambda ranked: ranked[0])
        prewhere_conditions = [
            condition
            for _, condition in ranked_conditions[:condition_limit]
        ]

        remaining_conditions = [
            condition
            for condition in get_first_level_and_conditions(ast_condition)
            if condition not in prewhere_conditions
        ]

        if remaining_conditions:
            query.set_ast_condition(
                combine_and_conditions(remaining_conditions))
        else:
            query.set_ast_condition(None)

        if prewhere_conditions:
            query.set_prewhere_ast_condition(
                combine_and_conditions(prewhere_conditions))
        else:
            query.set_prewhere_ast_condition(None)
Beispiel #6
0
def _get_table(query: Query) -> str:
    """
    Return the formatted FROM of the data source of the query.

    An empty string is returned when the query has no data source.
    """
    data_source = query.get_data_source()
    # A missing data source should never happen at this point.
    return "" if data_source is None else data_source.format_from()