def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    def process_column(exp: Expression) -> Expression:
        if isinstance(exp, Column):
            if exp.column_name == "group_id":
                return FunctionCall(
                    exp.alias,
                    "nullIf",
                    (
                        Column(None, exp.table_name, exp.column_name),
                        Literal(None, 0),
                    ),
                )
            elif exp.column_name == "message":
                # Because of the rename from message->search_message without backfill,
                # records will have one or the other of these fields.
                # TODO this can be removed once all data has search_message filled in.
                return FunctionCall(
                    exp.alias,
                    "coalesce",
                    (
                        Column(None, exp.table_name, "search_message"),
                        Column(None, exp.table_name, exp.column_name),
                    ),
                )

        return exp

    query.transform_expressions(process_column)
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    having_clause = query.get_having()
    if not having_clause:
        return None

    selected_columns = query.get_selected_columns()
    uniq_matcher = Param("function", FunctionCallMatch(String("uniq")))

    found_functions = []
    for exp in having_clause:
        match = uniq_matcher.match(exp)
        if match is not None:
            found_functions.append(match.expression("function"))

    if found_functions:
        matcher = _ExpressionOrAliasMatcher(found_functions)
        for col in selected_columns:
            col.expression.accept(matcher)

        if not all(matcher.found_expressions):
            should_throw = get_config("throw_on_uniq_select_and_having", False)
            error = MismatchedAggregationException(
                "Aggregation is in HAVING clause but not SELECT", query=str(query)
            )
            if should_throw:
                raise error
            else:
                logging.warning(
                    "Aggregation is in HAVING clause but not SELECT",
                    exc_info=True,
                    extra=cast(Dict[str, Any], error.to_dict()),
                )
def test_hexint_column_processor(unprocessed: Expression, formatted_value: str) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column1", Column(None, None, "column1"))
        ],
        condition=unprocessed,
    )

    HexIntColumnProcessor(set(["column1"])).process_query(
        unprocessed_query, HTTPQuerySettings()
    )
    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression(
            "column1",
            FunctionCall(
                None,
                "lower",
                (
                    FunctionCall(
                        None,
                        "hex",
                        (Column(None, None, "column1"),),
                    ),
                ),
            ),
        )
    ]

    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    def transform_nested_column(exp: Expression) -> Expression:
        subscript = match_subscriptable_reference(exp)
        if subscript is None:
            return exp

        if subscript.column_name in self.__specs:
            promoted_col_name = self.__specs[subscript.column_name].get(subscript.key)
            if promoted_col_name is not None:
                col_type = query.get_from_clause().get_columns().get(
                    promoted_col_name, None
                )
                col_type_name = str(col_type) if col_type else None

                # We need to pass the content of the promoted column to a toString
                # function when the promoted column is not a string since the
                # supported values of mapping columns are strings and the clients
                # expect such.
                if not self.__cast_to_string or (
                    col_type_name
                    and "String" in col_type_name
                    and "FixedString" not in col_type_name
                ):
                    return Column(exp.alias, subscript.table_name, promoted_col_name)
                else:
                    return FunctionCall(
                        exp.alias,
                        "toString",
                        (Column(None, subscript.table_name, promoted_col_name),),
                    )

        return exp

    query.transform_expressions(transform_nested_column)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    def process_condition(exp: Expression) -> Expression:
        result = CONDITION_PATTERN.match(exp)
        if result is not None:
            key_column = result.optional_string(KEY_COL_MAPPING_PARAM)
            if key_column == "tags.key":
                rhs = result.optional_string(KEY_MAPPING_PARAM)
                table_name = result.optional_string(TABLE_MAPPING_PARAM)
                replacement = FunctionCall(
                    exp.alias,
                    "has",
                    (Column(None, table_name, "tags.key"), Literal(None, rhs)),
                )
                assert isinstance(exp, FunctionCall)
                if exp.function_name == ConditionFunctions.EQ:
                    replacement = FunctionCall(exp.alias, "not", (replacement,))

                prev_value = query.get_experiment_value("empty-string-tag-condition")
                if prev_value is not None:
                    return replacement if prev_value == "true" else exp

                if settings.TESTING or random.random() < 0.5:
                    query.add_experiment("empty-string-tag-condition", "true")
                    return replacement
                else:
                    query.add_experiment("empty-string-tag-condition", "false")

        return exp

    condition = query.get_condition()
    if condition is not None:
        query.set_ast_condition(condition.transform(process_condition))
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None: state.set_config("max_group_ids_exclude", 5) set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS) PostReplacementConsistencyEnforcer( "project_id", ReplacerState.EVENTS ).process_query(query, HTTPRequestSettings()) assert query.get_condition_from_ast() == FunctionCall( None, BooleanFunctions.AND, ( FunctionCall( None, "notIn", ( FunctionCall( None, "assumeNotNull", (Column(None, None, "group_id"),) ), FunctionCall( None, "tuple", (Literal(None, 100), Literal(None, 101), Literal(None, 102),), ), ), ), build_in("project_id", [2]), ), ) assert not query.get_from_clause().final
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    from_clause = query.get_data_source()
    if not isinstance(from_clause, JoinClause):
        return

    referenced_columns = query.get_all_referenced_columns()
    referenced_aliases = set()
    for qualified_column in referenced_columns:
        # This will be much better when we represent columns
        # with a more structured data type than strings.
        match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
        if match:
            # match[1] is the first parenthesized group in the regex, thus
            # the table alias.
            table_alias = match[1]
            referenced_aliases.add(table_alias)

    assert (
        len(referenced_aliases) > 0
    ), "Trying to optimize a join query without aliases"
    if len(referenced_aliases) > 1:
        return

    from_tables = from_clause.get_tables()
    table = from_tables[referenced_aliases.pop()]

    query.set_data_source(table)
def test_query_data_source() -> None:
    """
    Tests using the Query as a data source
    """
    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "col1", Column(alias="col1", table_name=None, column_name="col1")
            ),
            SelectedExpression(
                "some_func",
                FunctionCall(
                    "some_func",
                    "f",
                    (Column(alias="col1", table_name=None, column_name="col1"),),
                ),
            ),
            SelectedExpression(
                None, Column(alias="col2", table_name=None, column_name="col2")
            ),
        ],
    )

    assert query.get_columns() == ColumnSet(
        [("col1", Any()), ("some_func", Any()), ("_invalid_alias_2", Any())]
    )
def _replace_ast_condition(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    """
    Replaces a condition in the top level AND boolean condition
    in the query WHERE clause.
    """

    def replace_condition(expression: Expression) -> Expression:
        match = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        ).match(expression)

        return (
            expression
            if match is None
            else replace(
                expression, parameters=(match.expression("column"), new_operand)
            )
        )

    condition = query.get_condition_from_ast()
    if condition is not None:
        query.set_ast_condition(
            combine_and_conditions(
                [
                    replace_condition(c)
                    for c in get_first_level_and_conditions(condition)
                ]
            )
        )
def test_without_turbo_without_projects_needing_final(query: ClickhouseQuery) -> None:
    PostReplacementConsistencyEnforcer("project_id", None).process_query(
        query, HTTPRequestSettings()
    )

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert not query.get_from_clause().final
def test_multiple_not_too_many_excludes(
    query_with_multiple_group_ids: ClickhouseQuery,
) -> None:
    """
    Query is looking for multiple groups and there are not too many groups to
    exclude, but there are fewer groups queried for than replaced.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)

    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    enforcer._set_query_final(query_with_multiple_group_ids, True)
    state.set_config("max_group_ids_exclude", 5)

    enforcer.process_query(query_with_multiple_group_ids, HTTPQuerySettings())

    assert query_with_multiple_group_ids.get_condition() == build_and(
        build_not_in("group_id", [101, 102]),
        build_and(build_in("project_id", [2]), build_in("group_id", [101, 102])),
    )
    assert not query_with_multiple_group_ids.get_from_clause().final
def test_single_too_many_exclude(query_with_single_group_id: ClickhouseQuery) -> None:
    """
    Query is looking for a group that has been replaced, and there are too many
    groups to exclude.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)

    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    enforcer._set_query_final(query_with_single_group_id, True)
    state.set_config("max_group_ids_exclude", 2)

    enforcer.process_query(query_with_single_group_id, HTTPQuerySettings())

    assert query_with_single_group_id.get_condition() == build_and(
        build_not_in("group_id", [101]),
        build_and(build_in("project_id", [2]), build_in("group_id", [101])),
    )
    assert not query_with_single_group_id.get_from_clause().final
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    def process_condition(exp: Expression) -> Expression:
        result = CONDITION_PATTERN.match(exp)
        if result is not None:
            key_column = result.optional_string(KEY_COL_MAPPING_PARAM)
            if key_column == "tags.key":
                rhs = result.optional_string(KEY_MAPPING_PARAM)
                table_name = result.optional_string(TABLE_MAPPING_PARAM)
                replacement = FunctionCall(
                    exp.alias,
                    "has",
                    (Column(None, table_name, "tags.key"), Literal(None, rhs)),
                )
                assert isinstance(exp, FunctionCall)
                if exp.function_name == ConditionFunctions.EQ:
                    replacement = FunctionCall(exp.alias, "not", (replacement,))
                return replacement

        return exp

    condition = query.get_condition()
    if condition is not None:
        query.set_ast_condition(condition.transform(process_condition))
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None: state.set_config("max_group_ids_exclude", 5) set_project_exclude_groups( 2, [100, 101, 102], ReplacerState.ERRORS, ReplacementType. EXCLUDE_GROUPS, # Arbitrary replacement type, no impact on tests ) PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS).process_query( query, HTTPQuerySettings()) assert query.get_condition() == build_and( FunctionCall( None, "notIn", ( FunctionCall(None, "assumeNotNull", (Column(None, None, "group_id"), )), FunctionCall( None, "tuple", ( Literal(None, 100), Literal(None, 101), Literal(None, 102), ), ), ), ), build_in("project_id", [2]), ) assert not query.get_from_clause().final
def test_query_overlaps_replacements_processor(
    query: ClickhouseQuery,
    query_with_timestamp: ClickhouseQuery,
    query_with_future_timestamp: ClickhouseQuery,
) -> None:
    enforcer = PostReplacementConsistencyEnforcer("project_id", ReplacerState.ERRORS)

    # replacement time unknown, default to "overlaps" but no groups to exclude
    # so shouldn't be final
    enforcer._set_query_final(query_with_timestamp, True)
    enforcer.process_query(query_with_timestamp, HTTPQuerySettings())
    assert not query_with_timestamp.get_from_clause().final

    # overlaps replacement and should be final due to too many groups to exclude
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        ReplacementType.EXCLUDE_GROUPS,  # Arbitrary replacement type, no impact on tests
    )
    enforcer._set_query_final(query_with_timestamp, False)
    enforcer.process_query(query_with_timestamp, HTTPQuerySettings())
    assert query_with_timestamp.get_from_clause().final

    # query time range unknown and should be final due to too many groups to exclude
    enforcer._set_query_final(query, False)
    enforcer.process_query(query, HTTPQuerySettings())
    assert query.get_from_clause().final

    # doesn't overlap replacements
    enforcer._set_query_final(query_with_future_timestamp, True)
    enforcer.process_query(query_with_future_timestamp, HTTPQuerySettings())
    assert not query_with_future_timestamp.get_from_clause().final
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.

    Which means: the arrayJoin is in the select clause, there are one or more
    top level AND conditions on the arrayJoin, and there is no OR condition
    in the query.
    """
    array_join_found = any(
        array_join_pattern(column_name).match(f) is not None
        for selected in query.get_selected_columns_from_ast() or []
        for f in selected.expression
    )

    if not array_join_found:
        return set()

    ast_condition = query.get_condition_from_ast()
    cond_keys = (
        _get_mapping_keys_in_condition(ast_condition, column_name)
        if ast_condition is not None
        else set()
    )
    if cond_keys is None:
        # This means we found an OR. Cowardly we give up even though there could
        # be cases where this condition is still optimizable.
        return set()

    ast_having = query.get_having_from_ast()
    having_keys = (
        _get_mapping_keys_in_condition(ast_having, column_name)
        if ast_having is not None
        else set()
    )
    if having_keys is None:
        # Same as above
        return set()

    return cond_keys | having_keys
def test_mand_conditions(table: str, mand_conditions: List[FunctionCall]) -> None:
    query = Query(
        Table(
            table,
            ColumnSet([]),
            final=False,
            sampling_rate=None,
            mandatory_conditions=mand_conditions,
        ),
        None,
        None,
        binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                OPERATOR_TO_FUNCTION["="],
                Column("d", None, "d"),
                Literal(None, "1"),
            ),
            binary_condition(
                OPERATOR_TO_FUNCTION["="],
                Column("c", None, "c"),
                Literal(None, "3"),
            ),
        ),
    )

    query_ast_copy = copy.deepcopy(query)

    request_settings = HTTPRequestSettings(consistent=True)
    processor = MandatoryConditionApplier()
    processor.process_query(query, request_settings)

    query_ast_copy.add_condition_to_ast(combine_and_conditions(mand_conditions))

    assert query.get_condition_from_ast() == query_ast_copy.get_condition_from_ast()
def get_filtered_mapping_keys(
    query: Query,
    extractors: Sequence[Extractor[T]],
    is_skippable_condition: Callable[[Expression], bool],
) -> Sequence[T]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.

    Which means: the arrayJoin is in the select clause, there are one or more
    top level AND conditions on the arrayJoin, and there is no OR condition
    in the query.
    """
    ast_condition = query.get_condition()
    cond_keys: Optional[Set[T]] = (
        get_mapping_keys_in_condition(ast_condition, extractors, is_skippable_condition)
        if ast_condition is not None
        else set()
    )
    if cond_keys is None:
        # This means we found an OR. Cowardly we give up even though there could
        # be cases where this condition is still optimizable.
        return []

    ast_having = query.get_having()
    having_keys: Optional[Set[T]] = (
        get_mapping_keys_in_condition(ast_having, extractors, is_skippable_condition)
        if ast_having is not None
        else set()
    )
    if having_keys is None:
        # Same as above
        return []

    keys = cond_keys | having_keys
    return sorted(list(keys))
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    missing_checkers = {checker for checker in self.__condition_checkers}

    def inspect_expression(condition: Expression) -> None:
        top_level = get_first_level_and_conditions(condition)
        for condition in top_level:
            for checker in self.__condition_checkers:
                if checker in missing_checkers:
                    if checker.check(condition):
                        missing_checkers.remove(checker)

    condition = query.get_condition()
    if condition is not None:
        inspect_expression(condition)

    prewhere = query.get_prewhere_ast()
    if prewhere is not None:
        inspect_expression(prewhere)

    missing_ids = {checker.get_id() for checker in missing_checkers}
    if get_config("mandatory_condition_enforce", 0):
        assert (
            not missing_checkers
        ), f"Missing mandatory columns in query. Missing {missing_ids}"
    else:
        if missing_checkers:
            logger.error(
                "Query is missing mandatory columns",
                extra={"missing_checkers": missing_ids},
            )
def _replace_condition(
    query: Query, field: str, operator: str, new_literal: Union[str, List[AnyType]]
) -> None:
    query.set_conditions(
        [
            cond
            if not _identify_condition(cond, field, operator)
            else [field, operator, new_literal]
            for cond in query.get_conditions() or []
        ]
    )
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    mandatory_conditions = query.get_from_clause().mandatory_conditions
    if len(mandatory_conditions) > 0:
        query.add_condition_to_ast(combine_and_conditions(mandatory_conditions))
def test_format_expressions(
    name: str, query: ClickhouseQuery, expected_query: ClickhouseQuery
) -> None:
    MappingColumnPromoter({"tags": {"promoted_tag": "promoted"}}).process_query(
        query, HTTPQuerySettings()
    )

    assert query.get_selected_columns() == expected_query.get_selected_columns()
def _set_query_final(self, query: Query, final: bool) -> None:
    """
    Set the 'final' clause of a Query.
    A query set as final will force ClickHouse to perform a merge
    on the results of the query. This is very performance heavy and
    should be avoided whenever possible.
    """
    query.set_from_clause(replace(query.get_from_clause(), final=final))
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    array_joins_in_query = self.__get_array_joins_in_query(query)
    tuple_alias = self.__get_unused_alias(query)

    single_filtered, multiple_filtered = self.get_filtered_arrays(
        query, self.key_columns
    )

    def replace_expression(expr: Expression) -> Expression:
        match = self.__array_join_pattern.match(expr)

        # The arrayJoins we are looking for are not present, so skip this entirely
        if match is None:
            return expr

        # All of the possible array joins are present
        if array_joins_in_query == set(self.all_columns):
            tuple_index = self.__find_tuple_index(match.string("col"))

            single_index_filtered = {
                self.__find_tuple_index(column_name): filtered
                for column_name, filtered in single_filtered.items()
            }

            multiple_indices_filtered = {
                tuple(
                    self.__find_tuple_index(column) for column in column_names
                ): filtered
                for column_names, filtered in multiple_filtered.items()
            }

            if single_filtered or multiple_filtered:
                return filtered_mapping_tuples(
                    expr.alias,
                    tuple_alias,
                    tuple_index,
                    self.all_columns,
                    single_index_filtered,
                    multiple_indices_filtered,
                )

            return unfiltered_mapping_tuples(
                expr.alias, tuple_alias, tuple_index, self.all_columns
            )

        # Only array join present is one of the key columns
        elif len(array_joins_in_query) == 1 and any(
            column in array_joins_in_query for column in self.key_columns
        ):
            column_name = array_joins_in_query.pop()
            if column_name in single_filtered:
                return filtered_mapping_keys(
                    expr.alias, column_name, single_filtered[column_name]
                )

        # No viable optimization
        return expr

    query.transform_expressions(replace_expression)
def _update_conditions(
    self, query: Query, prewhere_conditions: Sequence[Condition]
) -> None:
    conditions = query.get_conditions()
    # This should never be None at this point, but for mypy this can be None.
    assert conditions is not None
    query.set_conditions(
        [cond for cond in conditions if cond not in prewhere_conditions]
    )
    query.set_prewhere(prewhere_conditions)
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    single_filtered, multiple_filtered = self.get_filtered_arrays(
        query, self.key_columns
    )

    bloom_filter_condition = generate_bloom_filter_condition(
        self.column_name, single_filtered, multiple_filtered
    )

    if bloom_filter_condition:
        query.add_condition_to_ast(bloom_filter_condition)
def test_tags_processor(
    query_body: MutableMapping[str, Any], expected_query: ClickhouseQuery
) -> None:
    """
    Tests the whole processing in some notable cases.
    """
    processed = parse_and_process(query_body)
    assert processed.get_selected_columns() == expected_query.get_selected_columns()
    assert processed.get_condition() == expected_query.get_condition()
    assert processed.get_having() == expected_query.get_having()
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    ).process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert query.get_from_clause().final
def test_without_turbo_with_projects_needing_final(query: ClickhouseQuery) -> None:
    set_project_needs_final(2, ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    ).process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert query.get_final()
def _list_array_join(query: Query) -> Columnset:
    ret = set()
    query_arrayjoin = query.get_arrayjoin_from_ast()
    if query_arrayjoin is not None:
        ret |= _get_columns_from_expression(query_arrayjoin)

    for e in query.get_all_expressions():
        if isinstance(e, FunctionCallExpr) and e.function_name == "arrayJoin":
            ret |= _get_columns_from_expression(e)

    return ret