def __init__(self, query: Query, settings: RequestSettings,) -> None:
    """
    Snapshots all the clauses of a Clickhouse query, plus the request
    settings, so the query can be formatted later without going back
    to the Query object.
    """
    # Clickhouse query structure
    # Referencing them here directly since it makes it easier
    # to process this query independently from the Clickhouse Query
    # and there is no risk in doing so since they are immutable.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_data_source()
    self.__arrayjoin = query.get_arrayjoin_from_ast()
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()
    # A HAVING clause is only meaningful when the query aggregates.
    if self.__having:
        assert self.__groupby, "found HAVING clause with no GROUP BY"
    self.__turbo = settings.get_turbo()
    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    self.__prewhere = query.get_prewhere_ast()
    self.__settings = settings
    # Lazily-populated caches for the formatted output; filled on first use.
    self.__sql_data_list: Optional[Sequence[Tuple[str, str]]] = None
    self.__formatted_query: Optional[str] = None
    self.__sql_data: Optional[Mapping[str, str]] = None
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Identifies the conditions we can apply the arrayFilter optimization on.
    Which means: if the arrayJoin is in the select clause, there are one or
    more top level AND condition on the arrayJoin and there is no OR condition
    in the query.
    """
    pattern = array_join_pattern(column_name)
    selected_columns = query.get_selected_columns_from_ast() or []
    # The optimization only applies when the arrayJoin on this column
    # actually appears somewhere in the select clause.
    if not any(
        pattern.match(exp) is not None
        for selected in selected_columns
        for exp in selected.expression
    ):
        return set()

    keys: Set[str] = set()
    for clause in (query.get_condition_from_ast(), query.get_having_from_ast()):
        if clause is None:
            continue
        clause_keys = _get_mapping_keys_in_condition(clause, column_name)
        if clause_keys is None:
            # This means we found an OR. Cowardly we give up even though there
            # could be cases where this condition is still optimizable.
            return set()
        keys |= clause_keys
    return keys
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Classifies the WHERE and HAVING clauses and, when at least one of them
    is optimizable (and neither forbids the optimization), rewrites both
    through the hash-based replacement. Guarded by a runtime killswitch.
    """
    if not get_config(self.__killswitch, 1):
        return

    where_clause = query.get_condition_from_ast()
    where_class = (
        self.__classify_combined_conditions(where_clause)
        if where_clause is not None
        else ConditionClass.IRRELEVANT
    )
    if where_class == ConditionClass.NOT_OPTIMIZABLE:
        return

    having_clause = query.get_having_from_ast()
    having_class = (
        self.__classify_combined_conditions(having_clause)
        if having_clause is not None
        else ConditionClass.IRRELEVANT
    )
    if having_class == ConditionClass.NOT_OPTIMIZABLE:
        return

    # Nothing to do unless at least one clause can benefit.
    if ConditionClass.OPTIMIZABLE not in (where_class, having_class):
        return

    metrics.increment("optimizable_query")

    if where_clause is not None:
        query.set_ast_condition(where_clause.transform(self.__replace_with_hash))
    if having_clause is not None:
        query.set_ast_having(having_clause.transform(self.__replace_with_hash))
def test_tags_processor(query_body: MutableMapping[str, Any], expected_query: ClickhouseQuery) -> None:
    """
    End-to-end check of the processing pipeline over a few notable cases:
    the processed query must match the expected one clause by clause.
    """
    actual = parse_and_process(query_body)

    assert actual.get_selected_columns_from_ast() == expected_query.get_selected_columns_from_ast()
    assert actual.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert actual.get_having_from_ast() == expected_query.get_having_from_ast()
def test_translation(mappers: TranslationMappers, query: SnubaQuery, expected: ClickhouseQuery) -> None:
    """
    Translates a Snuba query through the given mappers and checks each
    clause of the result against the expected Clickhouse query.
    """
    translated = QueryTranslator(mappers).translate(query)

    # TODO: consider providing an __eq__ method to the Query class. Or turn it
    # into a dataclass.
    for accessor in (
        "get_selected_columns_from_ast",
        "get_groupby_from_ast",
        "get_condition_from_ast",
        "get_arrayjoin_from_ast",
        "get_having_from_ast",
        "get_orderby_from_ast",
    ):
        assert getattr(expected, accessor)() == getattr(translated, accessor)()
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites equality/inequality conditions on a nested column (e.g. tags)
    into LIKE / NOT LIKE conditions on the pre-flattened column, when it is
    safe to do so.
    """
    conditions = query.get_conditions()
    if not conditions:
        return

    # Enable the processor only if we have enough data in the flattened
    # columns. Which have been deployed at BEGINNING_OF_TIME. If the query
    # starts earlier than that we do not apply the optimization.
    if self.__beginning_of_time:
        apply_optimization = False
        for condition in conditions:
            # Look for a lower-bound condition on one of the timestamp
            # columns to establish the query's start time.
            if (
                is_condition(condition)
                and isinstance(condition[0], str)
                and condition[0] in self.__timestamp_cols
                and condition[1] in (">=", ">")
                and isinstance(condition[2], str)
            ):
                try:
                    start_ts = parse_datetime(condition[2])
                    if (start_ts - self.__beginning_of_time).total_seconds() > 0:
                        apply_optimization = True
                except Exception:
                    # We should not get here, it means the from timestamp is malformed
                    # Returning here is just for safety
                    logger.error(
                        "Cannot parse start date for NestedFieldOptimizer: %r",
                        condition,
                    )
                    return
        if not apply_optimization:
            return

    # Do not use flattened tags if tags are being unpacked anyway. In that case
    # using flattened tags only implies loading an additional column thus making
    # the query heavier and slower
    if self.__has_tags(query.get_arrayjoin_from_ast()):
        return
    if query.get_groupby_from_ast():
        for expression in query.get_groupby_from_ast():
            if self.__has_tags(expression):
                return
    if self.__has_tags(query.get_having_from_ast()):
        return
    if query.get_orderby_from_ast():
        for orderby in query.get_orderby_from_ast():
            if self.__has_tags(orderby.expression):
                return

    new_conditions = []
    positive_like_expression: List[str] = []
    negative_like_expression: List[str] = []

    for c in conditions:
        # Conditions that are not optimizable are passed through unchanged.
        keyvalue = self.__is_optimizable(c, self.__nested_col)
        if not keyvalue:
            new_conditions.append(c)
        else:
            expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
            if keyvalue.operand == Operand.EQ:
                positive_like_expression.append(expression)
            else:
                negative_like_expression.append(expression)

    if positive_like_expression:
        # Positive conditions "=" are all merged together in one LIKE expression
        positive_like_expression = sorted(positive_like_expression)
        like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
        new_conditions.append(
            [self.__flattened_col, "LIKE", like_formatted])
    for expression in negative_like_expression:
        # Negative conditions "!=" cannot be merged together. We can still transform
        # them into NOT LIKE statements, but each condition has to be one
        # statement.
        not_like_formatted = f"%|{expression}|%"
        new_conditions.append(
            [self.__flattened_col, "NOT LIKE", not_like_formatted])

    query.set_conditions(new_conditions)
def test_replace_expression() -> None:
    """
    Create a query with the new AST and replaces a function with a different
    function replaces f1(...) with tag(f1)
    """
    col_c1 = Column(None, "t1", "c1")
    col_c2 = Column(None, "t1", "c2")
    f1_call = FunctionCall("alias", "f1", (col_c1, col_c2))
    f2_call = FunctionCall("alias", "f2", (col_c2,))

    where = binary_condition(ConditionFunctions.EQ, f1_call, Literal(None, "1"))
    pre = binary_condition(ConditionFunctions.EQ, f1_call, Literal(None, "2"))
    order = OrderBy(OrderByDirection.ASC, f2_call)

    query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", f1_call)],
        array_join=None,
        condition=where,
        groupby=[f1_call],
        having=None,
        prewhere=pre,
        order_by=[order],
    )

    def replace(exp: Expression) -> Expression:
        # Swap any f1 call for tag(f1), preserving the alias.
        if isinstance(exp, FunctionCall) and exp.function_name == "f1":
            return FunctionCall(exp.alias, "tag", (Literal(None, "f1"),))
        return exp

    query.transform_expressions(replace)

    # Expressions are immutable, so the same replacement node can be shared
    # across all the clauses of the expected query.
    tagged = FunctionCall("alias", "tag", (Literal(None, "f1"),))
    expected_query = Query(
        Table("my_table", ColumnSet([])),
        selected_columns=[SelectedExpression("alias", tagged)],
        array_join=None,
        condition=binary_condition(ConditionFunctions.EQ, tagged, Literal(None, "1")),
        groupby=[tagged],
        prewhere=binary_condition(ConditionFunctions.EQ, tagged, Literal(None, "2")),
        having=None,
        order_by=[order],
    )

    assert query.get_selected_columns_from_ast() == expected_query.get_selected_columns_from_ast()
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()
    assert list(query.get_all_expressions()) == list(expected_query.get_all_expressions())