def __init__(self, query: Query, settings: RequestSettings,) -> None:
    """
    Capture the parts of the query and the settings needed for formatting.

    The components of the Clickhouse query are referenced directly on this
    object so that it can be processed independently from the Clickhouse
    Query it was built from. This is safe since those components are
    immutable.

    :param query: The Clickhouse query to read the clauses from.
    :param settings: The request settings; also provides the turbo flag.
    :raises AssertionError: If the query has a HAVING clause but no
        GROUP BY clause.
    """
    # Clauses and modifiers read from the AST based query.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_data_source()
    self.__arrayjoin = query.get_arrayjoin_from_ast()
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()

    # A HAVING clause is only accepted together with a GROUP BY clause.
    assert (
        not self.__having or self.__groupby
    ), "found HAVING clause with no GROUP BY"

    self.__settings = settings
    self.__turbo = settings.get_turbo()

    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    self.__prewhere = query.get_prewhere_ast()

    # Start out empty.
    # NOTE(review): presumably filled in lazily by the formatting methods
    # of this class, which are not visible from here -- confirm.
    self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
    self.__formatted_query: Optional[str] = None
    self.__sql_data: Optional[Mapping[str, str]] = None
def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
    """
    Replace a join data source with a single table when possible.

    If the query reads from a join but all the qualified columns it
    references use the same table alias, the data source of the query is
    replaced with the table that alias maps to. Queries that do not read
    from a join, or that reference more than one alias, are left untouched.

    :param query: The query to inspect and, possibly, modify in place.
    :param request_settings: Not used by this processor.
    :raises AssertionError: If the query reads from a join but no
        referenced column carries a table alias.
    """
    join_clause = query.get_data_source()
    if not isinstance(join_clause, JoinClause):
        return

    # Columns are still represented as strings, so the table alias has to
    # be extracted with a regex. Group 1 of the pattern is the first
    # parenthesized group, thus the table alias.
    alias_matches = (
        QUALIFIED_COLUMN_REGEX.match(column_name)
        for column_name in query.get_all_referenced_columns()
    )
    used_aliases = {found[1] for found in alias_matches if found}

    assert (
        len(used_aliases) > 0
    ), "Trying to otpimize a join query without aliases"

    if len(used_aliases) > 1:
        # More than one table is actually needed: keep the join.
        return

    tables_by_alias = join_clause.get_tables()
    only_alias = used_aliases.pop()
    query.set_data_source(tables_by_alias[only_alias])
def _format_storage_query_and_run(
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
    from_date: datetime,
    to_date: datetime,
    referrer: str,
    clickhouse_query: Query,
    request_settings: RequestSettings,
    reader: Reader[SqlQuery],
) -> QueryResult:
    """
    Format the storage query and hand it over to the DB specific code
    for execution.

    The query is wrapped into an AstSqlQuery inside a "create_query"
    tracing span, the "prepare_query" timer mark is recorded, and the
    execution is then delegated to raw_query inside a second span that
    is tagged with the table name.

    :param timer: Timer on which the "prepare_query" mark is recorded.
    :param query_metadata: Passed through to raw_query.
    :param from_date: Start of the queried range, used for "num_days".
    :param to_date: End of the queried range, used for "num_days".
    :param referrer: Recorded in the stats passed to raw_query.
    :param clickhouse_query: The query to format and execute.
    :param request_settings: Settings used to format and run the query.
    :param reader: Passed through to raw_query.
    :return: Whatever raw_query returns.
    """
    # TODO: This function (or rather a wrapper around it) is where the
    # result will be transformed according to the SelectedExpression
    # objects in the query, so that the fields in the QueryResult have
    # the names the user expects.
    table_name = clickhouse_query.get_data_source().format_from()

    with sentry_sdk.start_span(
        description="create_query", op="db"
    ) as create_span:
        sql_query = AstSqlQuery(clickhouse_query, request_settings)
        create_span.set_data("query", sql_query.sql_data())
        metrics.increment("execute")

    timer.mark("prepare_query")

    query_stats = {
        "clickhouse_table": table_name,
        "final": clickhouse_query.get_final(),
        "referrer": referrer,
        "num_days": (to_date - from_date).days,
        "sample": clickhouse_query.get_sample(),
    }

    # The formatted SQL is used as the description of the execution span.
    span_description = sql_query.format_sql()
    with sentry_sdk.start_span(
        description=span_description, op="db"
    ) as execute_span:
        execute_span.set_tag("table", table_name)
        return raw_query(
            clickhouse_query,
            request_settings,
            sql_query,
            reader,
            timer,
            query_metadata,
            query_stats,
            execute_span.trace_id,
        )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Run both the legacy and the AST prewhere processors on the query.

    Nothing happens when the data source of the query provides no
    prewhere candidates.

    :param query: The query to process in place.
    :param request_settings: Not used by this processor.
    """
    # Fall back to the global setting when no limit was configured on
    # this instance.
    condition_cap: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )

    candidate_keys = query.get_data_source().get_prewhere_candidates()
    if not candidate_keys:
        return

    # Both implementations modify the query, but one works on the legacy
    # representation and the other on the AST, so they do not interfere
    # with each other and can be executed independently, one after the
    # other.
    for processor_class in (LegacyPrewhereProcessor, ASTPrewhereProcessor):
        processor_class().process_query(query, condition_cap, candidate_keys)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Move the best first level AND conditions from WHERE to PREWHERE.

    A first level condition is eligible when it is a FunctionCall whose
    function is one of ALLOWED_OPERATORS and it references at least one
    of the prewhere candidate columns of the data source. Eligible
    conditions are ranked by the position, in the candidates list, of
    their best placed column. Up to the configured maximum number of them
    are removed from the query condition and set as prewhere condition.

    :param query: The query to process in place.
    :param request_settings: Not used by this processor.
    """
    condition_cap: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )

    candidate_keys = query.get_data_source().get_prewhere_candidates()
    if not candidate_keys:
        return

    where_clause = query.get_condition_from_ast()
    if where_clause is None:
        return

    def references_candidate_key(expression) -> bool:
        # True if any column of the expression is a prewhere candidate.
        return any(
            column.column_name in candidate_keys
            for column in get_columns_in_expression(expression)
        )

    def best_key_position(columns) -> int:
        # Lowest index, in the candidates list, among the given columns.
        return min(
            candidate_keys.index(column.column_name)
            for column in columns
            if column.column_name in candidate_keys
        )

    eligible = []
    for expression in get_first_level_and_conditions(where_clause):
        if (
            isinstance(expression, FunctionCall)
            and expression.function_name in ALLOWED_OPERATORS
            and references_candidate_key(expression)
        ):
            eligible.append((get_columns_in_expression(expression), expression))

    if not eligible:
        return

    # The earlier a column appears in the candidates list, the higher the
    # priority of the conditions that reference it. The sort is stable,
    # so conditions with the same priority keep their original order.
    ranked = [
        (best_key_position(columns), expression)
        for columns, expression in eligible
    ]
    ranked.sort(key=lambda priority_and_condition: priority_and_condition[0])
    promoted = [expression for _, expression in ranked[:condition_cap]]

    # Whatever was not promoted stays in the regular condition.
    remaining = [
        expression
        for expression in get_first_level_and_conditions(where_clause)
        if expression not in promoted
    ]

    if remaining:
        query.set_ast_condition(combine_and_conditions(remaining))
    else:
        query.set_ast_condition(None)

    if promoted:
        query.set_prewhere_ast_condition(combine_and_conditions(promoted))
    else:
        query.set_prewhere_ast_condition(None)
def _get_table(query: Query) -> str:
    """
    Return the formatted FROM clause of the data source of the query.

    :param query: The query to read the data source from.
    :return: The result of format_from() on the data source, or an empty
        string if the query has no data source.
    """
    data_source = query.get_data_source()
    # A missing data source should never happen at this point.
    return "" if data_source is None else data_source.format_from()