def __init__(self, query: Query, settings: RequestSettings,) -> None:
    """
    Snapshots the Clickhouse query structure into instance attributes.

    The individual clauses are read out of the query object up front so
    that this object can process the query independently from the
    Clickhouse Query instance; this is safe because the captured values
    are immutable.
    """
    # Clickhouse query structure, captured clause by clause.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_data_source()
    self.__arrayjoin = query.get_arrayjoin_from_ast()
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()

    # A HAVING clause is only meaningful when grouping is present.
    if self.__having:
        assert self.__groupby, "found HAVING clause with no GROUP BY"

    self.__turbo = settings.get_turbo()
    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    self.__prewhere = query.get_prewhere_ast()

    self.__settings = settings

    # Lazily-built formatted representations of the query.
    self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
    self.__formatted_query: Optional[str] = None
    self.__sql_data: Optional[Mapping[str, str]] = None
def test_translation(mappers: TranslationMappers, query: SnubaQuery, expected: ClickhouseQuery) -> None:
    """Translate ``query`` and check each clause against ``expected``."""
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    clause_getters = (
        "get_selected_columns_from_ast",
        "get_groupby_from_ast",
        "get_condition_from_ast",
        "get_arrayjoin_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    )
    for getter in clause_getters:
        # Compare the clause produced by the translation with the expectation.
        assert getattr(expected, getter)() == getattr(translated, getter)()
def generate_profile(query: Query) -> ClickhouseQueryProfile:
    """
    Takes a Physical query in, analyzes it and produces the
    ClickhouseQueryProfile data structure.
    """
    where = query.get_condition_from_ast()
    groupby = query.get_groupby_from_ast()

    try:
        # Every analyzer call stays inside the try block: a failure in any
        # of them falls back to the empty profile below.
        where_profile = FilterProfile(
            columns=set() if where is None else _list_columns(where),
            mapping_cols=set() if where is None else _list_mapping(where),
        )
        return ClickhouseQueryProfile(
            time_range=_get_date_range(query),
            table=_get_table(query),
            all_columns=_get_all_columns(query),
            multi_level_condition=_has_complex_conditions(query),
            where_profile=where_profile,
            groupby_cols=set() if groupby is None else _list_groupby_columns(groupby),
            array_join_cols=_list_array_join(query),
        )
    except Exception:
        # Should never happen, but it is not worth failing queries while
        # rolling this out because we cannot build the profile.
        logger.warning("Failed to build query profile", exc_info=True)
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=FilterProfile(columns=set(), mapping_cols=set()),
            groupby_cols=set(),
            array_join_cols=set(),
        )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites equality/inequality conditions on the nested column into
    LIKE / NOT LIKE conditions on the flattened column, when doing so
    is safe and likely to make the query cheaper.
    """
    conditions = query.get_conditions()
    if not conditions:
        return

    # Enable the processor only if we have enough data in the flattened
    # columns. Which have been deployed at BEGINNING_OF_TIME. If the query
    # starts earlier than that we do not apply the optimization.
    if self.__beginning_of_time:
        starts_after_rollout = False
        for cond in conditions:
            is_start_bound = (
                is_condition(cond)
                and isinstance(cond[0], str)
                and cond[0] in self.__timestamp_cols
                and cond[1] in (">=", ">")
                and isinstance(cond[2], str)
            )
            if not is_start_bound:
                continue
            try:
                start_ts = parse_datetime(cond[2])
                if (start_ts - self.__beginning_of_time).total_seconds() > 0:
                    starts_after_rollout = True
            except Exception:
                # We should not get here, it means the from timestamp is malformed
                # Returning here is just for safety
                logger.error(
                    "Cannot parse start date for NestedFieldOptimizer: %r",
                    cond,
                )
                return
        if not starts_after_rollout:
            return

    # Do not use flattened tags if tags are being unpacked anyway. In that case
    # using flattened tags only implies loading an additional column thus making
    # the query heavier and slower
    if self.__has_tags(query.get_arrayjoin_from_ast()):
        return
    if any(self.__has_tags(exp) for exp in query.get_groupby_from_ast() or []):
        return
    if self.__has_tags(query.get_having_from_ast()):
        return
    if any(
        self.__has_tags(ob.expression) for ob in query.get_orderby_from_ast() or []
    ):
        return

    new_conditions = []
    eq_pairs: List[str] = []
    neq_pairs: List[str] = []
    for cond in conditions:
        kv = self.__is_optimizable(cond, self.__nested_col)
        if not kv:
            # Not a nested key/value condition: keep it untouched.
            new_conditions.append(cond)
            continue
        pair = f"{escape_field(kv.nested_col_key)}={escape_field(kv.value)}"
        if kv.operand == Operand.EQ:
            eq_pairs.append(pair)
        else:
            neq_pairs.append(pair)

    if eq_pairs:
        # Positive conditions "=" are all merged together in one LIKE expression
        like_formatted = f"%|{'|%|'.join(sorted(eq_pairs))}|%"
        new_conditions.append([self.__flattened_col, "LIKE", like_formatted])

    # Negative conditions "!=" cannot be merged together. We can still transform
    # them into NOT LIKE statements, but each condition has to be one
    # statement.
    for pair in neq_pairs:
        new_conditions.append([self.__flattened_col, "NOT LIKE", f"%|{pair}|%"])

    query.set_conditions(new_conditions)
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    If a query is:
        - ORDER BY timestamp DESC
        - has no grouping
        - has an offset/limit
        - has a large time range
    We know we have to reverse-sort the entire set of rows to return the small
    chunk at the end of the time range, so optimistically split the time range
    into smaller increments, and start with the last one, so that we can
    potentially avoid querying the entire range.

    Returns None when the query does not qualify for splitting, otherwise the
    merged result of the sub-queries.
    """
    limit = query.get_limit()
    if limit is None or query.get_groupby_from_ast():
        return None

    if query.get_offset() >= 1000:
        return None

    # Only apply when the query is sorted by the timestamp column, newest
    # first, since the strategy walks the time range backwards.
    orderby = query.get_orderby_from_ast()
    if (
        not orderby
        or orderby[0].direction != OrderByDirection.DESC
        or not isinstance(orderby[0].expression, ColumnExpr)
        or not orderby[0].expression.column_name == self.__timestamp_col
    ):
        return None

    from_date_ast, to_date_ast = get_time_range(query, self.__timestamp_col)
    if from_date_ast is None or to_date_ast is None:
        return None

    # NOTE(review): date_align is fetched but not used in this method —
    # confirm whether it is still needed here.
    date_align, split_step = state.get_configs(
        [("date_align_seconds", 1), ("split_step", 3600)]  # default 1 hour
    )

    # Offset still to be consumed by trimming rows off the front of the
    # accumulated data; decreases as sub-queries return rows.
    remaining_offset = query.get_offset()

    overall_result = None
    # Walk the time range backwards from to_date, one window per iteration.
    split_end = to_date_ast
    split_start = max(split_end - timedelta(seconds=split_step), from_date_ast)
    total_results = 0
    while split_start < split_end and total_results < limit:
        # We need to make a copy to use during the query execution because we
        # replace the start-end conditions on the query at each iteration of
        # this loop.
        split_query = copy.deepcopy(query)
        _replace_ast_condition(
            split_query, self.__timestamp_col, ">=", LiteralExpr(None, split_start)
        )
        _replace_ast_condition(
            split_query, self.__timestamp_col, "<", LiteralExpr(None, split_end)
        )

        # Because its paged, we have to ask for (limit+offset) results
        # and set offset=0 so we can then trim them ourselves.
        split_query.set_offset(0)
        split_query.set_limit(limit - total_results + remaining_offset)

        # At every iteration we only append the "data" key from the results
        # returned by the runner. The "extra" key is only populated at the
        # first iteration of the loop and never changed.
        result = runner(split_query, request_settings)

        if overall_result is None:
            overall_result = result
        else:
            overall_result.result["data"].extend(result.result["data"])

        # Consume the original offset by trimming rows from the front of the
        # accumulated data before counting results.
        if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
            to_trim = min(remaining_offset, len(overall_result.result["data"]))
            overall_result.result["data"] = overall_result.result["data"][to_trim:]
            remaining_offset -= to_trim

        total_results = len(overall_result.result["data"])

        if total_results < limit:
            if len(result.result["data"]) == 0:
                # If we got nothing from the last query, expand the range by a
                # static factor
                split_step = split_step * STEP_GROWTH
            else:
                # If we got some results but not all of them, estimate how big
                # the time range should be for the next query based on how many
                # results we got for our last query and its time range, and how
                # many we have left to fetch.
                remaining = limit - total_results
                split_step = split_step * math.ceil(
                    remaining / float(len(result.result["data"]))
                )

            # Set the start and end of the next query based on the new range.
            split_end = split_start
            try:
                split_start = max(
                    split_end - timedelta(seconds=split_step), from_date_ast
                )
            except OverflowError:
                # split_step grew past what timedelta can represent: clamp the
                # next window to the full remaining range.
                split_start = from_date_ast

    return overall_result
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split query in 2 steps if a large number of columns is being selected.
        - First query only selects event_id, project_id and timestamp.
        - Second query selects all fields for only those events.
        - Shrink the date range.

    Returns None when the query does not qualify for splitting (so the
    execution strategy falls through to the next splitter), otherwise the
    result of the second, narrowed query.
    """
    limit = query.get_limit()
    if (
        limit is None
        or limit == 0
        or query.get_groupby_from_ast()
        or not query.get_selected_columns_from_ast()
    ):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (Column(None, String(self.__id_column)), AnyExpression(),),
    )

    for expr in query.get_condition_from_ast() or []:
        match = id_column_matcher.match(expr)
        if match:
            return None

    # We need to count the number of table/column name pairs
    # not the number of distinct Column objects in the query
    # so to avoid counting aliased columns multiple times.
    total_columns = {
        (col.table_name, col.column_name)
        for col in query.get_all_ast_referenced_columns()
    }

    minimal_query = copy.deepcopy(query)
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns(
        [
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None, self.__timestamp_column),
            ),
        ]
    )

    # Warn (without aborting) if any expression in the query aliases one of
    # the three key columns to something other than the column itself, since
    # the minimal selection above would shadow it.
    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    # Only split when it actually saves columns over the minimal projection.
    minimal_columns = {
        (col.table_name, col.column_name)
        for col in minimal_query.get_all_ast_referenced_columns()
    }
    if len(total_columns) <= len(minimal_columns):
        return None

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    result = runner(minimal_query, request_settings)
    del minimal_query

    if not result.result["data"]:
        return None

    # Making a copy just in case runner returned None (which would drive the
    # execution strategy to ignore the result of this splitter and try the
    # next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]])
    )
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond clickhouse maximum query
        # size, so we cowardly abandon.
        metrics.increment("column_splitter.intermediate_results_beyond_limit")
        return None

    # Narrow the second query down to exactly the IDs found by the first.
    query.add_condition_to_ast(
        in_condition(
            None,
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        )
    )
    query.set_offset(0)
    # TODO: This is technically wrong. Event ids are unique per project, not
    # globally. So, if the minimal query only returned the same event_id from
    # two projects, we would be underestimating the limit here.
    query.set_limit(len(event_ids))

    project_ids = list(
        set([event[self.__project_column] for event in result.result["data"]])
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    # Shrink the time range to the window actually covered by the results.
    timestamps = [event[self.__timestamp_column] for event in result.result["data"]]
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
        ),
    )

    return runner(query, request_settings)
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different
    function: replaces f1(...) with tag(f1).
    """
    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2,))
    condition = binary_condition(ConditionFunctions.EQ, f1_call, Literal(None, "1"))
    prewhere = binary_condition(ConditionFunctions.EQ, f1_call, Literal(None, "2"))
    orderby = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=condition,
        groupby=[f1_call],
        having=None,
        prewhere=prewhere,
        order_by=[orderby],
    )

    def replace(exp: Expression) -> Expression:
        # Swap every call to f1 for tag("f1"), preserving the alias.
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))
        return exp

    query.transform_expressions(replace)

    # The expression every f1(...) occurrence should have been replaced with.
    tag_f1 = FunctionCall("alias", "tag", (Literal(None, "f1"),))
    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", tag_f1)],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, tag_f1, Literal(None, "1")
        ),
        groupby=[tag_f1],
        prewhere=binary_condition(
            ConditionFunctions.EQ, tag_f1, Literal(None, "2")
        ),
        having=None,
        order_by=[orderby],
    )

    assert (
        query.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions()
    )