def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """When the excluded group count exceeds the cap, FINAL is forced instead."""
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS)
    enforcer.process_query(query, HTTPRequestSettings())

    assert query.get_conditions() == [("project_id", "IN", [2])]
    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert query.get_final()
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Apply self.process_condition to both the WHERE and PREWHERE clauses."""
    for read_clause, write_clause in (
        (query.get_condition_from_ast, query.set_ast_condition),
        (query.get_prewhere_ast, query.set_prewhere_ast_condition),
    ):
        clause = read_clause()
        if clause:
            write_clause(clause.transform(self.process_condition))
    if self.formatted:
        metrics.increment("query_processed", tags={"type": self.formatted})
def test_tags_hash_map(
    query: ClickhouseQuery, expected_condition: Expression,
) -> None:
    """The tags hash-map optimizer must rewrite the condition as expected."""
    set_config("tags_hash_map_enabled", 1)

    optimizer = MappingOptimizer(
        column_name="tags",
        hash_map_name="_tags_hash_map",
        killswitch="tags_hash_map_enabled",
    )
    optimizer.process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == expected_condition
def test_tags_processor(
    query_body: MutableMapping[str, Any], expected_query: ClickhouseQuery
) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    # Compare the clauses individually: Query has no __eq__ of its own.
    for accessor in (
        "get_selected_columns_from_ast",
        "get_condition_from_ast",
        "get_having_from_ast",
    ):
        assert getattr(processed, accessor)() == getattr(expected_query, accessor)()
def _get_prewhere_candidates(
    self, query: Query, prewhere_keys: Sequence[str]
) -> Sequence[Tuple[Iterable[Column], Expression]]:
    # A condition is a PREWHERE candidate if:
    # - It is a single top-level condition (not OR-nested), and
    # - Any of its referenced columns are in prewhere_keys
    where = query.get_condition_from_ast()
    if where is None:
        return []

    candidates = []
    for cond in get_first_level_and_conditions(where):
        if not isinstance(cond, FunctionCall):
            continue
        if cond.function_name not in self.allowed_ast_operators:
            continue
        cols = get_columns_in_expression(cond)
        if any(col.column_name in prewhere_keys for col in cols):
            candidates.append((cols, cond))
    return candidates
def test_translation(
    mappers: TranslationMappers, query: SnubaQuery, expected: ClickhouseQuery
) -> None:
    """The translator output must match the expected query clause by clause."""
    translated = QueryTranslator(mappers).translate(query)
    # TODO: consider providing an __eq__ method to the Query class. Or turn it into
    # a dataclass.
    for accessor in (
        "get_selected_columns_from_ast",
        "get_groupby_from_ast",
        "get_condition_from_ast",
        "get_arrayjoin_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    ):
        assert getattr(expected, accessor)() == getattr(translated, accessor)()
def _update_conditions(
    self, query: Query, prewhere_conditions: Sequence[Expression]
) -> None:
    """Move the selected prewhere_conditions out of WHERE and into PREWHERE."""
    where = query.get_condition_from_ast()
    # The caller only reaches this point after finding candidates in the
    # WHERE clause, so it cannot be None; the assert narrows the type for mypy.
    assert where is not None

    remaining = [
        c
        for c in get_first_level_and_conditions(where)
        if c not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(remaining) if remaining else None
    )
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions) if prewhere_conditions else None
    )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Promote the highest-priority eligible top-level WHERE conditions into
    the PREWHERE clause, up to the configured maximum number of conditions.
    """
    limit: int = (self.__max_prewhere_conditions
                  or settings.MAX_PREWHERE_CONDITIONS)

    prewhere_keys = query.get_from_clause().get_prewhere_candidates()
    if not prewhere_keys:
        return
    where = query.get_condition_from_ast()
    if where is None:
        return

    # Only simple top-level AND'ed conditions using an allowed operator and
    # referencing at least one prewhere-eligible column qualify.
    candidates = []
    for cond in get_first_level_and_conditions(where):
        if not isinstance(cond, FunctionCall):
            continue
        if cond.function_name not in ALLOWED_OPERATORS:
            continue
        cols = get_columns_in_expression(cond)
        if any(col.column_name in prewhere_keys for col in cols):
            candidates.append((cols, cond))
    if not candidates:
        return

    # Rank each candidate by the best (lowest) position any of its columns
    # occupies in the prewhere keys list; sorted() is stable so ties keep
    # their original relative order.
    def priority(item):
        cols, _ = item
        return min(
            prewhere_keys.index(col.column_name)
            for col in cols
            if col.column_name in prewhere_keys
        )

    ranked = sorted(candidates, key=priority)
    prewhere_conditions = [cond for _, cond in ranked][:limit]

    remaining = [
        cond
        for cond in get_first_level_and_conditions(where)
        if cond not in prewhere_conditions
    ]
    query.set_ast_condition(
        combine_and_conditions(remaining) if remaining else None
    )
    query.set_prewhere_ast_condition(
        combine_and_conditions(prewhere_conditions) if prewhere_conditions else None
    )
def get_time_range(
    query: Query, timestamp_field: str
) -> Tuple[Optional[datetime], Optional[datetime]]:
    """
    Finds the minimal time range for this query: the >= timestamp condition
    with the highest datetime literal and the < timestamp condition with the
    smallest, returned as a (lower, upper) tuple.

    Only first level AND conditions are considered since, if the timestamp
    is nested in an OR, we cannot say anything on how that compares to the
    other timestamp conditions.
    """
    where = query.get_condition_from_ast()
    if not where:
        return (None, None)

    # Matches `timestamp_field >= <literal datetime>` or
    # `timestamp_field < <literal datetime>`; built once outside the loop.
    pattern = FunctionCall(
        None,
        Param(
            "operator",
            Or(
                [
                    String(OPERATOR_TO_FUNCTION[">="]),
                    String(OPERATOR_TO_FUNCTION["<"]),
                ]
            ),
        ),
        (
            Column(None, None, String(timestamp_field)),
            Literal(None, Param("timestamp", Any(datetime))),
        ),
    )

    lower: Optional[datetime] = None
    upper: Optional[datetime] = None
    for cond in get_first_level_and_conditions(where):
        match = pattern.match(cond)
        if match is None:
            continue
        ts = cast(datetime, match.scalar("timestamp"))
        if match.string("operator") == OPERATOR_TO_FUNCTION[">="]:
            if not lower or ts > lower:
                lower = ts
        else:
            if not upper or ts < upper:
                upper = ts
    return (lower, upper)
def test_mand_conditions(table: str, mand_conditions: List[FunctionCall]) -> None:
    """Mandatory table conditions must be AND'ed onto the query WHERE clause."""

    def eq(col: str, val: str) -> FunctionCall:
        # Shorthand for a `col = val` AST condition.
        return binary_condition(
            OPERATOR_TO_FUNCTION["="],
            Column(col, None, col),
            Literal(None, val),
        )

    query = Query(
        Table(
            table,
            ColumnSet([]),
            final=False,
            sampling_rate=None,
            mandatory_conditions=mand_conditions,
            prewhere_candidates=["c1"],
        ),
        None,
        None,
        binary_condition(BooleanFunctions.AND, eq("d", "1"), eq("c", "3")),
    )

    untouched = copy.deepcopy(query)

    MandatoryConditionApplier().process_query(
        query, HTTPRequestSettings(consistent=True)
    )

    untouched.add_condition_to_ast(combine_and_conditions(mand_conditions))
    assert query.get_condition_from_ast() == untouched.get_condition_from_ast()
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.
    Which means: if the arrayJoin is in the select clause, there are one or
    more top level AND condition on the arrayJoin and there is no OR condition
    in the query.
    """
    pattern = array_join_pattern(column_name)
    if not any(
        pattern.match(f) is not None
        for selected in query.get_selected_columns_from_ast() or []
        for f in selected.expression
    ):
        return set()

    keys: Set[str] = set()
    for clause in (query.get_condition_from_ast(), query.get_having_from_ast()):
        if clause is None:
            continue
        clause_keys = _get_mapping_keys_in_condition(clause, column_name)
        if clause_keys is None:
            # An OR was found. Cowardly we give up even though there could
            # be cases where this condition is still optimizable.
            return set()
        keys |= clause_keys
    return keys
def generate_profile(query: Query) -> ClickhouseQueryProfile:
    """
    Takes a Physical query in, analyzes it and produces the
    ClickhouseQueryProfile data structure.
    """
    where = query.get_condition_from_ast()
    groupby = query.get_groupby_from_ast()
    try:
        return ClickhouseQueryProfile(
            time_range=_get_date_range(query),
            table=_get_table(query),
            all_columns=_get_all_columns(query),
            multi_level_condition=_has_complex_conditions(query),
            where_profile=FilterProfile(
                columns=set() if where is None else _list_columns(where),
                mapping_cols=set() if where is None else _list_mapping(where),
            ),
            groupby_cols=set() if groupby is None else _list_groupby_columns(groupby),
            array_join_cols=_list_array_join(query),
        )
    except Exception:
        # Should never happen, but it is not worth failing queries while
        # rolling this out because we cannot build the profile.
        logger.warning("Failed to build query profile", exc_info=True)
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=FilterProfile(columns=set(), mapping_cols=set()),
            groupby_cols=set(),
            array_join_cols=set(),
        )
def _replace_ast_condition(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    """
    Replaces a condition in the top level AND boolean condition
    in the query WHERE clause.
    """
    where = query.get_condition_from_ast()
    if where is None:
        return

    # Matches `field <operator> <anything>`; built once, reused per condition.
    matcher = FunctionCall(
        String(OPERATOR_TO_FUNCTION[operator]),
        (Param("column", Column(None, String(field))), AnyExpression()),
    )

    def substitute(expression: Expression) -> Expression:
        match = matcher.match(expression)
        if match is None:
            return expression
        return replace(
            expression, parameters=(match.expression("column"), new_operand)
        )

    query.set_ast_condition(
        combine_and_conditions(
            [substitute(c) for c in get_first_level_and_conditions(where)]
        )
    )
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """Below the exclusion cap, group ids are excluded instead of forcing FINAL."""
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer("project_id", ReplacerState.EVENTS).process_query(
        query, HTTPRequestSettings()
    )

    assert query.get_conditions() == [
        ("project_id", "IN", [2]),
        (["assumeNotNull", ["group_id"]], "NOT IN", [100, 101, 102]),
    ]

    not_null_group_id = FunctionCall(
        None, "assumeNotNull", (Column(None, None, "group_id"),)
    )
    excluded_groups = FunctionCall(
        None,
        "tuple",
        (Literal(None, 100), Literal(None, 101), Literal(None, 102)),
    )
    assert query.get_condition_from_ast() == FunctionCall(
        None,
        BooleanFunctions.AND,
        (
            FunctionCall(None, "notIn", (not_null_group_id, excluded_groups)),
            build_in("project_id", [2]),
        ),
    )
    assert not query.get_final()
def test_with_turbo(query: ClickhouseQuery) -> None:
    """In turbo mode only the project filter remains on the query."""
    enforcer = PostReplacementConsistencyEnforcer("project_id", None)
    enforcer.process_query(query, HTTPRequestSettings(turbo=True))
    assert query.get_condition_from_ast() == build_in("project_id", [2])
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replace a function with a different
    function: replaces f1(...) with tag(f1).
    """
    column1 = Column(None, "t1", "c1")
    column2 = Column(None, "t1", "c2")
    function_1 = FunctionCall("alias", "f1", (column1, column2))
    function_2 = FunctionCall("alias", "f2", (column2,))

    orderby = OrderBy(OrderByDirection.ASC, function_2)
    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", function_1)],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, function_1, Literal(None, "1")
        ),
        groupby=[function_1],
        having=None,
        prewhere=binary_condition(
            ConditionFunctions.EQ, function_1, Literal(None, "2")
        ),
        order_by=[orderby],
    )

    def swap_f1(exp: Expression) -> Expression:
        # Substitute every f1(...) call with tag('f1'), keeping the alias.
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))
        return exp

    query.transform_expressions(swap_f1)

    tagged = FunctionCall("alias", "tag", (Literal(None, "f1"),))
    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", tagged)],
        array_join=None,
        condition=binary_condition(
            ConditionFunctions.EQ, tagged, Literal(None, "1")
        ),
        groupby=[tagged],
        prewhere=binary_condition(
            ConditionFunctions.EQ, tagged, Literal(None, "2")
        ),
        having=None,
        order_by=[orderby],
    )

    for accessor in (
        "get_selected_columns_from_ast",
        "get_condition_from_ast",
        "get_groupby_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    ):
        assert getattr(query, accessor)() == getattr(expected_query, accessor)()

    assert list(query.get_all_expressions()) == list(
        expected_query.get_all_expressions()
    )
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    Split query in 2 steps if a large number of columns is being selected.
        - First query only selects event_id, project_id and timestamp.
        - Second query selects all fields for only those events.
        - Shrink the date range.

    Returns None whenever the split does not apply, letting the caller fall
    back to running the query unsplit (or trying another splitter).
    """
    limit = query.get_limit()
    # Only simple, limited selects qualify: no groupby, no aggregations,
    # and an explicit non-zero limit.
    if (limit is None or limit == 0 or query.get_groupby()
            or query.get_aggregations()
            or not query.get_selected_columns()):
        return None

    if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
        metrics.increment("column_splitter.query_above_limit")
        return None

    # Do not split if there is already a = or IN condition on an ID column
    id_column_matcher = FunctionCall(
        Or([String(ConditionFunctions.EQ), String(ConditionFunctions.IN)]),
        (
            Column(None, String(self.__id_column)),
            AnyExpression(),
        ),
    )

    # Iterating an Expression yields its sub-expressions, so this scans the
    # whole WHERE tree for an ID equality/membership condition.
    for expr in query.get_condition_from_ast() or []:
        match = id_column_matcher.match(expr)

        if match:
            return None

    # We need to count the number of table/column name pairs
    # not the number of distinct Column objects in the query
    # so to avoid counting aliased columns multiple times.
    total_columns = {(col.table_name, col.column_name)
                     for col in query.get_all_ast_referenced_columns()}

    minimal_query = copy.deepcopy(query)
    minimal_query.set_selected_columns(
        [self.__id_column, self.__project_column, self.__timestamp_column])
    # TODO: provide the table alias name to this splitter if we ever use it
    # in joins.
    minimal_query.set_ast_selected_columns([
        SelectedExpression(self.__id_column,
                           ColumnExpr(None, None, self.__id_column)),
        SelectedExpression(self.__project_column,
                           ColumnExpr(None, None, self.__project_column)),
        SelectedExpression(
            self.__timestamp_column,
            ColumnExpr(None, None, self.__timestamp_column),
        ),
    ])

    # Warn (but do not abort) if some other expression's alias shadows one of
    # the three columns the minimal query selects.
    for exp in minimal_query.get_all_expressions():
        if exp.alias in (
            self.__id_column,
            self.__project_column,
            self.__timestamp_column,
        ) and not (isinstance(exp, ColumnExpr) and exp.column_name == exp.alias):
            logger.warning(
                "Potential alias shadowing due to column splitter",
                extra={"expression": exp},
                exc_info=True,
            )

    minimal_columns = {
        (col.table_name, col.column_name)
        for col in minimal_query.get_all_ast_referenced_columns()
    }
    # Splitting only pays off when the full query reads strictly more columns
    # than the minimal one.
    if len(total_columns) <= len(minimal_columns):
        return None

    # Ensures the AST minimal query is actually runnable on its own.
    if not minimal_query.validate_aliases():
        return None

    legacy_references = set(minimal_query.get_all_referenced_columns())
    ast_column_names = {
        c.column_name for c in minimal_query.get_all_ast_referenced_columns()
    }
    # Ensures the legacy minimal query (which does not expand alias references)
    # does not contain alias references we removed when creating minimal_query.
    if legacy_references - ast_column_names:
        metrics.increment("columns.skip_invalid_legacy_query")
        return None

    result = runner(minimal_query, request_settings)
    del minimal_query

    if not result.result["data"]:
        return None

    # Making a copy just in case runner returned None (which would drive the execution
    # strategy to ignore the result of this splitter and try the next one).
    query = copy.deepcopy(query)

    event_ids = list(
        set([event[self.__id_column] for event in result.result["data"]]))
    if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
        # We may be running a query that is beyond clickhouse maximum query size,
        # so we cowardly abandon.
        metrics.increment(
            "column_splitter.intermediate_results_beyond_limit")
        return None

    # Narrow both the legacy and the AST representation of the query to the
    # events found by the minimal query.
    query.add_conditions([(self.__id_column, "IN", event_ids)])
    query.add_condition_to_ast(
        in_condition(
            None,
            ColumnExpr(None, None, self.__id_column),
            [LiteralExpr(None, e_id) for e_id in event_ids],
        ))
    query.set_offset(0)
    # TODO: This is technically wrong. Event ids are unique per project, not globally.
    # So, if the minimal query only returned the same event_id from two projects, we
    # would be underestimating the limit here.
    query.set_limit(len(event_ids))

    project_ids = list(
        set([
            event[self.__project_column] for event in result.result["data"]
        ]))
    _replace_condition(
        query,
        self.__project_column,
        "IN",
        project_ids,
    )
    _replace_ast_condition(
        query,
        self.__project_column,
        "IN",
        literals_tuple(None, [LiteralExpr(None, p_id) for p_id in project_ids]),
    )

    # Shrink the time range to the span actually covered by the found events.
    timestamps = [
        event[self.__timestamp_column] for event in result.result["data"]
    ]
    _replace_condition(
        query,
        self.__timestamp_column,
        ">=",
        util.parse_datetime(min(timestamps)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        ">=",
        LiteralExpr(None, util.parse_datetime(min(timestamps))),
    )
    # We add 1 second since this gets translated to ('timestamp', '<', to_date)
    # and events are stored with a granularity of 1 second.
    _replace_condition(
        query,
        self.__timestamp_column,
        "<",
        (util.parse_datetime(max(timestamps)) +
         timedelta(seconds=1)).isoformat(),
    )
    _replace_ast_condition(
        query,
        self.__timestamp_column,
        "<",
        LiteralExpr(
            None,
            (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
        ),
    )

    return runner(query, request_settings)
def get_project_ids_in_query_ast(
    query: Query, project_column: str
) -> Optional[Set[int]]:
    """
    Finds the project ids this query is filtering according to the AST
    query representation.

    It works like get_project_ids_in_query with the exception that
    boolean functions are supported here.
    """
    # Matchers are pure; build them once instead of per recursive call.
    eq_matcher = FunctionCall(
        None,
        String(ConditionFunctions.EQ),
        (
            Column(column_name=String(project_column)),
            Literal(value=Param("project_id", Any(int))),
        ),
    )
    in_matcher = is_in_condition_pattern(
        Column(column_name=String(project_column))
    )
    bool_matcher = FunctionCall(
        None,
        Param(
            "operator",
            Or([String(BooleanFunctions.AND), String(BooleanFunctions.OR)]),
        ),
        (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
    )

    def visit(condition: Expression) -> Optional[Set[int]]:
        """
        Extract project ids from an expression. Returns None when no
        project_id condition is found, and an empty set when conflicting
        project_id conditions are found.
        """
        found = eq_matcher.match(condition)
        if found is not None:
            return {found.integer("project_id")}

        found = in_matcher.match(condition)
        if found is not None:
            tuple_expr = found.expression("tuple")
            assert isinstance(tuple_expr, FunctionCallExpr)
            return {
                lit.value
                for lit in tuple_expr.parameters
                if isinstance(lit, LiteralExpr) and isinstance(lit.value, int)
            }

        found = bool_matcher.match(condition)
        if found is not None:
            lhs = visit(found.expression("lhs"))
            rhs = visit(found.expression("rhs"))
            if lhs is None:
                return rhs
            if rhs is None:
                return lhs
            # AND intersects the two sides; OR unions them.
            if found.string("operator") == BooleanFunctions.AND:
                return lhs & rhs
            return lhs | rhs

        return None

    condition = query.get_condition_from_ast()
    return visit(condition) if condition is not None else None