def _validate_arrayjoin(query: Query) -> None:
    # TODO: Actually validate arrayjoin. For now log how it is used.
    # Records (via metrics) whether arrayjoin appears as a query-body clause
    # and/or through the arrayJoin function, and over how many distinct columns.
    body_arrayjoin = ""
    arrayjoin = query.get_arrayjoin_from_ast()
    if arrayjoin is not None:
        if isinstance(arrayjoin, Column):
            body_arrayjoin = arrayjoin.column_name

    array_joins = set()
    if body_arrayjoin:
        array_joins.add(body_arrayjoin)
    for exp in query.get_all_expressions():
        match = ARRAYJOIN_FUNCTION_MATCH.match(exp)
        if match is not None:
            if isinstance(exp, Column):
                # NOTE(review): a Column matching a *function* pattern looks
                # unreachable — confirm ARRAYJOIN_FUNCTION_MATCH semantics.
                array_joins.add(exp.column_name)
            else:
                # Not a plain column: record the expression type instead.
                array_joins.add(f"{type(exp)}")

    if len(array_joins) > 0:
        # Key encodes where arrayjoin came from and whether it spans
        # one column or several.
        join_type = "body" if body_arrayjoin else "function"
        suffix = "gt1" if len(array_joins) > 1 else "eq1"
        key = f"arrayjoin.{join_type}.{suffix}"
        metrics.increment(
            key,
            tags={"arrayjoin": ",".join(array_joins)},
        )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Expands calls to this processor's custom function: each matching call is
    validated, its actual parameters are bound to the declared parameter
    names, and the call is replaced by the function body with the original
    alias preserved.

    Raises InvalidCustomFunctionCall (chained from the validator's
    InvalidFunctionCall) when the call does not satisfy the signature.
    """

    def apply_function(expression: Expression) -> Expression:
        if (
            isinstance(expression, FunctionCall)
            and expression.function_name == self.__function_name
        ):
            try:
                self.__validator.validate(
                    expression.parameters, self.__dataset_schema
                )
            except InvalidFunctionCall as exception:
                raise InvalidCustomFunctionCall(
                    expression,
                    f"Illegal call to function {expression.function_name}: {str(exception)}",
                ) from exception

            # Bind declared parameter names to actual parameters.
            # NOTE(review): zip silently truncates on arity mismatch —
            # presumably the validator above enforces arity; confirm.
            resolved_params = {
                name: expression
                for (name, expression) in zip(
                    self.__param_names, expression.parameters
                )
            }
            ret = replace_in_expression(self.__body, resolved_params)
            # Keep the alias of the original call so references to it
            # elsewhere in the query still resolve.
            return replace(ret, alias=expression.alias)
        else:
            return expression

    query.transform_expressions(apply_function)
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    """
    Applies the timeseries extension data to the query: sets the requested
    granularity and adds the half-open time-range condition
    from_date <= timestamp_column < to_date to the AST.
    """
    from_date, to_date = self.get_time_limit(extension_data)
    query.set_granularity(extension_data["granularity"])
    query.add_condition_to_ast(
        binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                ConditionFunctions.GTE,
                Column(
                    # The _snuba_ prefixed alias is how the column is
                    # referenced elsewhere in the AST.
                    f"_snuba_{self.__timestamp_column}",
                    None,
                    self.__timestamp_column,
                ),
                Literal(None, from_date),
            ),
            binary_condition(
                ConditionFunctions.LT,
                Column(
                    f"_snuba_{self.__timestamp_column}",
                    None,
                    self.__timestamp_column,
                ),
                Literal(None, to_date),
            ),
        )
    )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites a few basic functions into their Clickhouse-safe form:
    uniq(x) -> ifNull(uniq(x), 0), emptyIfNull(x) -> ifNull(emptyIfNull(x), '')
    and the curried function top(...) -> topK(...). Aliases are preserved
    on the rewritten calls.
    """
    # Functions whose result must be coalesced to a default value.
    null_defaults = {
        "uniq": Literal(None, 0),
        "emptyIfNull": Literal(None, ""),
    }

    def process_functions(exp: Expression) -> Expression:
        if isinstance(exp, FunctionCall):
            default = null_defaults.get(exp.function_name)
            if default is not None:
                # Wrap the original call (alias stripped) in ifNull and move
                # the alias onto the wrapper.
                return FunctionCall(
                    exp.alias,
                    "ifNull",
                    (replace(exp, alias=None), default),
                )
        if isinstance(exp, CurriedFunctionCall):
            if exp.internal_function.function_name == "top":
                return replace(
                    exp,
                    internal_function=replace(
                        exp.internal_function, function_name="topK"
                    ),
                )
        return exp

    query.transform_expressions(process_functions)
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    """
    Runs SimpleJoinOptimizer over a legacy query built on a simple two-table
    join structure and checks the formatted FROM clause against the expected
    SQL string.
    """
    query = Query(
        {
            "selected_columns": selected_cols,
            "conditions": conditions,
            "arrayjoin": None,
            "having": [],
            "groupby": groupby,
            "aggregations": [],
            "orderby": None,
            "limitby": None,
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        simple_join_structure,
    )
    request_settings = HTTPRequestSettings()

    optimizer = SimpleJoinOptimizer()
    optimizer.process_query(query, request_settings)

    assert query.get_data_source().format_from() == expected
def _deescape_aliases(query: Query) -> None:
    """
    The legacy query processing does not escape user declared aliases
    thus aliases like project.name would make the query fail. So Sentry
    started defining pre-escaped aliases like `project.name` to go
    around the problem.
    The AST processing properly escapes aliases thus causing double
    escaping. We need to de-escape them in the AST query to preserve
    backward compatibility as long as the legacy query processing is
    around.
    """

    def deescape(expression: Optional[str]) -> Optional[str]:
        # Returns the unescaped name captured by DEESCAPER_RE, or the
        # input unchanged when it is None or not escaped.
        if expression is not None:
            match = DEESCAPER_RE.match(expression)
            if match:
                return match[1]
        return expression

    # De-escape aliases on every expression in the query...
    query.transform_expressions(
        lambda expr: replace(expr, alias=deescape(expr.alias))
    )

    # ...and on the selected-column names, which are stored separately
    # from the expressions themselves.
    query.set_ast_selected_columns(
        [
            replace(s, name=deescape(s.name))
            for s in query.get_selected_columns_from_ast() or []
        ]
    )
def _parse_subscriptables(query: Query) -> None:
    """
    Turns columns formatted as tags[asd] into SubscriptableReference.
    """
    # Aliases already declared in the query: a column whose name collides
    # with one of them is a reference to that alias, not a tags[...] access,
    # so it must be left alone.
    current_aliases = {exp.alias for exp in query.get_all_expressions() if exp.alias}

    def transform(exp: Expression) -> Expression:
        if not isinstance(exp, Column) or exp.column_name in current_aliases:
            return exp
        match = NESTED_COL_EXPR_RE.match(exp.column_name)
        if match is None:
            # This is not a tag[asd] column.
            return exp
        col_name = match[1]
        key_name = match[2]
        # Alias the reference with the original raw name (e.g. "tags[asd]")
        # so later references to that name still resolve.
        return SubscriptableReference(
            alias=exp.column_name,
            column=Column(None, None, col_name),
            key=Literal(None, key_name),
        )

    query.transform_expressions(transform)
def test_format_expressions(pre_format: Query, expected_query: Query) -> None:
    """
    BasicFunctionsProcessor must rewrite the query so that its selected
    columns, groupby and condition match the expected query.
    """
    formatted = deepcopy(pre_format)
    BasicFunctionsProcessor().process_query(formatted, HTTPRequestSettings())

    assert (
        formatted.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert formatted.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert (
        formatted.get_condition_from_ast()
        == expected_query.get_condition_from_ast()
    )
def __get_filter_tags(self, query: Query) -> List[str]:
    """
    Identifies the tag names we can apply the arrayFilter optimization on.
    Which means: if the tags_key column is in the select clause and there
    are one or more top level conditions on the tags_key column.

    We can only apply the arrayFilter optimization to tag keys conditions
    that are not in OR with other columns. To simplify the problem, we only
    consider those conditions that are included in the first level of the query:
    [['tagskey' '=' 'a'],['col' '=' 'b'],['col2' '=' 'c']]
    works
    [[['tagskey' '=' 'a'], ['col2' '=' 'b']], ['tagskey' '=' 'c']]
    does not
    """
    # Feature flag: optimization is on by default.
    if not state.get_config("ast_tag_processor_enabled", 1):
        return []

    # The optimization only applies when tags_key is selected.
    tags_key_found = any(
        "tags_key" in columns_in_expr(expression)
        for expression in query.get_selected_columns() or []
    )

    if not tags_key_found:
        return []

    def extract_tags_from_condition(
        cond: Sequence[Condition],
    ) -> Optional[List[str]]:
        # Returns the tag keys constrained by top-level = / IN conditions,
        # or None when a nested (OR) condition is found, which disables
        # the optimization entirely.
        if not cond:
            return []

        ret = []
        for c in cond:
            if not is_condition(c):
                # This is an OR
                return None

            if c[1] == "=" and c[0] == "tags_key" and isinstance(c[2], str):
                ret.append(str(c[2]))

            elif (
                c[1] == "IN"
                and c[0] == "tags_key"
                and isinstance(c[2], (list, tuple))
            ):
                ret.extend([str(tag) for tag in c[2]])

        return ret

    cond_tags_key = extract_tags_from_condition(query.get_conditions() or [])
    if cond_tags_key is None:
        # This means we found an OR. Cowardly we give up even though there could
        # be cases where this condition is still optimizable.
        return []
    having_tags_key = extract_tags_from_condition(query.get_having() or [])
    if having_tags_key is None:
        # Same as above
        return []

    return cond_tags_key + having_tags_key
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Attaches the from-clause's mandatory conditions (if any) to the query
    AST as a single AND-combined condition.
    """
    conditions = query.get_from_clause().get_mandatory_conditions()
    if conditions:
        query.add_condition_to_ast(combine_and_conditions(conditions))
def process_query(self, query: Query, query_settings: QuerySettings) -> None:
    """
    Pins the query to a single granularity by adding a
    granularity = <value> equality condition to the AST.
    """
    granularity_condition = binary_condition(
        ConditionFunctions.EQ,
        Column(None, None, "granularity"),
        Literal(None, self.__get_granularity(query)),
    )
    query.add_condition_to_ast(granularity_condition)
def test_col_replacement(
    initial_query: MutableMapping[str, Any],
    old_col: str,
    new_col: str,
    expected: Mapping[str, Any],
) -> None:
    """
    replace_column must rewrite every occurrence of old_col into new_col in
    the legacy query body.
    """
    query = Query(initial_query, TableSource("my_table", ColumnSet([])))
    query.replace_column(old_col, new_col)
    assert expected == query.get_body()
def track_bad_query(
    query: Query,
    selected_entity: EntityKey,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> None:
    """
    Emits metrics describing discover queries that reference columns not
    available on the entity they were routed to (events-only columns on a
    transactions query and vice versa). Tracking only; the query is not
    rejected here.
    """
    event_columns = set()
    transaction_columns = set()
    # Collect referenced columns that exist on only one of the two tables.
    for col in query.get_all_ast_referenced_columns():
        if events_only_columns.get(col.column_name):
            event_columns.add(col.column_name)
        elif transactions_only_columns.get(col.column_name):
            transaction_columns.add(col.column_name)

    # Subscript accesses (e.g. tags[x]) are tracked through the name of the
    # underlying nested column.
    for subscript in query.get_all_ast_referenced_subscripts():
        schema_col_name = subscript_key_column_name(subscript)
        if events_only_columns.get(schema_col_name):
            event_columns.add(schema_col_name)
        if transactions_only_columns.get(schema_col_name):
            transaction_columns.add(schema_col_name)

    # A mismatch means the selected entity cannot provide the referenced
    # single-table columns.
    event_mismatch = event_columns and selected_entity == TRANSACTIONS
    transaction_mismatch = transaction_columns and selected_entity in [
        EVENTS,
        EVENTS_AND_TRANSACTIONS,
    ]

    if event_mismatch or transaction_mismatch:
        missing_columns = ",".join(
            sorted(event_columns if event_mismatch else transaction_columns)
        )
        selected_entity_str = (
            str(selected_entity.value)
            if isinstance(selected_entity, EntityKey)
            else selected_entity
        )
        metrics.increment(
            "query.impossible",
            tags={
                "selected_table": selected_entity_str,
                "missing_columns": missing_columns,
            },
        )

    if selected_entity == EVENTS_AND_TRANSACTIONS and (
        event_columns or transaction_columns
    ):
        # Not possible in future with merge table
        missing_events_columns = ",".join(sorted(event_columns))
        missing_transactions_columns = ",".join(sorted(transaction_columns))
        metrics.increment(
            "query.impossible-merge-table",
            tags={
                "missing_events_columns": missing_events_columns,
                "missing_transactions_columns": missing_transactions_columns,
            },
        )
    else:
        # NOTE(review): this else pairs only with the merge-table check above,
        # so a query already counted as "query.impossible" can also be counted
        # as "query.success" — confirm this is the intended accounting.
        metrics.increment("query.success")
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Applies the data source's mandatory conditions to both the legacy
    representation and the AST of the query, keeping the two in sync.
    """
    mandatory_conditions = query.get_data_source().get_mandatory_conditions()
    if len(mandatory_conditions) > 0:
        # Previously the legacy add_conditions call ran unconditionally
        # (possibly with an empty list) while the AST path was guarded;
        # guard both so the query is only touched when there is something
        # to add.
        query.add_conditions([c.legacy for c in mandatory_conditions])
        query.add_condition_to_ast(
            combine_and_conditions([c.ast for c in mandatory_conditions])
        )
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Replaces every expression that matches this processor's matcher with the
    output of the transformation function, keeping the original alias.
    """

    def apply_matcher(expression: Expression) -> Expression:
        match_result = self.__matcher.match(expression)
        if match_result is None:
            return expression
        transformed = self.__transformation_fn(match_result, expression)
        # Preserve the alias so references to this expression stay valid.
        return replace(transformed, alias=expression.alias)

    query.transform_expressions(apply_matcher)
def test_project_extension_query_processing(
    raw_data: Mapping[str, Any],
    expected_conditions: Sequence[Condition],
    expected_ast_conditions: Expression,
) -> None:
    """
    Validates extension data against the project extension schema, runs the
    extension processor and checks the AST condition added to the query.
    """
    # NOTE(review): expected_conditions is unused here — presumably kept for
    # parametrize compatibility with the legacy variant of this test; confirm.
    extension = ProjectExtension(project_column="project_id")
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []}, QueryEntity(EntityKey.EVENTS, ColumnSet([])))
    request_settings = HTTPRequestSettings()

    extension.get_processor().process_query(query, valid_data, request_settings)
    assert query.get_condition_from_ast() == expected_ast_conditions
def test_timeseries_format_expressions(
    granularity: int,
    condition: Optional[FunctionCall],
    exp_column: FunctionCall,
    exp_condition: Optional[FunctionCall],
    formatted_column: str,
    formatted_condition: str,
) -> None:
    """
    Runs the entity's TimeSeriesProcessor over a query selecting the "time"
    column and checks:
    - the rewritten selected columns and condition against the expected AST;
    - the Clickhouse formatting of the rewritten column/condition;
    - that the original granularity can be extracted back from the query.
    """
    unprocessed = Query(
        QueryEntity(EntityKey.EVENTS, ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression("my_time", Column("my_time", None, "time")),
        ],
        condition=condition,
        groupby=[Column("my_time", None, "time")],
        granularity=granularity,
    )
    expected = Query(
        QueryEntity(EntityKey.EVENTS, ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(exp_column.alias, exp_column),
        ],
        condition=exp_condition,
    )

    entity = TransactionsEntity()
    processors = entity.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(unprocessed, HTTPRequestSettings())

    assert expected.get_selected_columns() == unprocessed.get_selected_columns()
    assert expected.get_condition() == unprocessed.get_condition()

    # The second selected column is the rewritten time expression.
    ret = unprocessed.get_selected_columns()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert ret == formatted_column
    if condition:
        query_condition = unprocessed.get_condition()
        assert query_condition is not None  # mypy
        ret = query_condition.accept(ClickhouseExpressionFormatter())
        assert formatted_condition == ret

    assert extract_granularity_from_query(unprocessed, "finish_ts") == granularity
def test_organization_extension_query_processing_happy_path():
    """
    The organization extension must add an org_id = <organization> condition
    to the query AST.
    """
    extension = OrganizationExtension()
    valid_data = validate_jsonschema({"organization": 2}, extension.get_schema())

    query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])))
    extension.get_processor().process_query(query, valid_data, HTTPRequestSettings())

    expected_condition = binary_condition(
        None,
        ConditionFunctions.EQ,
        Column(None, None, "org_id"),
        Literal(None, 2),
    )
    assert query.get_condition_from_ast() == expected_condition
def test_pattern_replacer_format_expressions(
    unprocessed: Query, expected: Query
) -> None:
    """
    PatternReplacer must swap every matched column for a
    nullIf(column, '') function call.
    """

    def to_null_if(match: MatchResult, expression: Expression) -> Expression:
        assert isinstance(expression, Column)  # mypy
        return FunctionCall(
            None,
            "nullIf",
            (Column(None, None, expression.column_name), Literal(None, "")),
        )

    replacer = PatternReplacer(
        Param("column", ColumnMatch(None, StringMatch("column1"))),
        to_null_if,
    )
    replacer.process_query(unprocessed, HTTPRequestSettings())

    assert expected.get_selected_columns() == unprocessed.get_selected_columns()
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    """
    Constrains the query to the organization carried by the extension data
    by adding an org_id equality condition to the AST.
    """
    org_id = extension_data["organization"]
    org_condition = binary_condition(
        ConditionFunctions.EQ,
        Column("_snuba_org_id", None, "org_id"),
        Literal(None, org_id),
    )
    query.add_condition_to_ast(org_condition)
def test_format_expressions(query_body: str, expected_query: Query) -> None:
    """
    Parses a SnQL query string against the events dataset and compares every
    clause of the resulting AST with the expected query.
    """
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")
    query = parse_snql_query(query_body, events)

    assert (
        query.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
def test_project_extension_query_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_ast_conditions: Expression,
) -> None:
    """
    Legacy variant of the project extension test: checks both the legacy
    conditions and the AST condition added by the processor.
    """
    extension = ProjectExtension(project_column="project_id")
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )
    request_settings = HTTPRequestSettings()

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions
    assert query.get_condition_from_ast() == expected_ast_conditions
def test_organization_extension_query_processing_happy_path() -> None:
    """
    The organization extension must add an org_id = <organization> condition
    to the query AST (entity-based variant of this test).
    """
    extension = OrganizationExtension()
    raw_data = {"organization": 2}

    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []}, QueryEntity(EntityKey.EVENTS, ColumnSet([])))
    request_settings = HTTPRequestSettings()

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_condition_from_ast() == binary_condition(
        ConditionFunctions.EQ,
        Column("_snuba_org_id", None, "org_id"),
        Literal(None, 2),
    )
def test_timeseries_format_expressions(
    granularity: int,
    condition: Optional[FunctionCall],
    exp_column: FunctionCall,
    exp_condition: Optional[FunctionCall],
    formatted_column: str,
    formatted_condition: str,
) -> None:
    """
    Legacy variant of the TimeSeriesProcessor test: runs the processor over
    a query selecting the "time" column and compares the rewritten AST and
    its Clickhouse formatting with the expected values.
    """
    unprocessed = Query(
        {},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression("my_time", Column("my_time", None, "time")),
        ],
        condition=condition,
        granularity=granularity,
    )
    expected = Query(
        {"granularity": granularity},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration", Column("transaction.duration", None, "duration")
            ),
            SelectedExpression(exp_column.alias, exp_column),
        ],
        condition=exp_condition,
    )

    entity = TransactionsEntity()
    processors = entity.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(unprocessed, HTTPRequestSettings())

    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )
    assert expected.get_condition_from_ast() == unprocessed.get_condition_from_ast()

    # The second selected column is the rewritten time expression.
    ret = unprocessed.get_selected_columns_from_ast()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert ret == formatted_column
    if condition:
        ret = unprocessed.get_condition_from_ast().accept(
            ClickhouseExpressionFormatter()
        )
        assert formatted_condition == ret
def test_format_clickhouse_specific_query() -> None:
    """
    Adds a few of the Clickhouse specific fields to the query (sample,
    totals, limitby, final, array join) and verifies each one shows up in
    the formatted SQL clauses produced by AstSqlQuery.
    """
    query = Query(
        {"sample": 0.1, "totals": True, "limitby": (10, "environment")},
        TableSource("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column1", Column(None, None, "column1")),
            SelectedExpression("column2", Column(None, "table1", "column2")),
        ],
        condition=binary_condition(
            None,
            "eq",
            lhs=Column(None, None, "column1"),
            rhs=Literal(None, "blabla"),
        ),
        groupby=[Column(None, None, "column1"), Column(None, "table1", "column2")],
        having=binary_condition(
            None,
            "eq",
            lhs=Column(None, None, "column1"),
            rhs=Literal(None, 123),
        ),
        order_by=[OrderBy(OrderByDirection.ASC, Column(None, None, "column1"))],
        array_join=Column(None, None, "column1"),
    )
    query.set_final(True)
    query.set_offset(50)
    query.set_limit(100)

    request_settings = HTTPRequestSettings()
    clickhouse_query = AstSqlQuery(query, request_settings)

    expected = {
        "from": "FROM my_table FINAL SAMPLE 0.1",
        "group": "GROUP BY (column1, table1.column2) WITH TOTALS",
        "having": "HAVING eq(column1, 123)",
        "array_join": "ARRAY JOIN column1",
        "limit": "LIMIT 100 OFFSET 50",
        "limitby": "LIMIT 10 BY environment",
        "order": "ORDER BY column1 ASC",
        "select": "SELECT column1, table1.column2",
        "where": "WHERE eq(column1, 'blabla')",
    }

    assert clickhouse_query.sql_data() == expected
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Rewrites references to the configured logical time columns into
    time-bucketing expressions built by time_expr, using the query
    granularity (defaulting to 3600 when none is set).
    """

    def process_column(exp: Expression) -> Expression:
        if not isinstance(exp, Column):
            return exp
        if exp.column_name not in self.__time_columns:
            return exp
        real_column_name = self.__time_columns[exp.column_name]
        granularity = query.get_granularity()
        if granularity is None:
            # Default to hourly buckets when the query does not specify one.
            granularity = 3600
        return self.time_expr(real_column_name, granularity, exp.alias)

    query.transform_expressions(process_column)
def _validate_aliases(query: Query) -> None:
    """
    Ensures that no alias has been defined multiple times for different
    expressions in the query. Thus rejecting queries with shadowing.

    Raises AliasShadowingException on the first alias bound to two
    different expressions.
    """
    all_declared_aliases: MutableMapping[str, Expression] = {}
    for exp in query.get_all_expressions():
        if exp.alias is not None:
            if exp.alias == "":
                # TODO: Enforce this in the parser when we are sure it is not
                # happening.
                metrics.increment("empty_alias")

            # The same alias on two *equal* expressions is fine (the query
            # can legitimately repeat an expression); only a different
            # expression under the same alias is shadowing.
            if (
                exp.alias in all_declared_aliases
                and exp != all_declared_aliases[exp.alias]
            ):
                raise AliasShadowingException(
                    (
                        f"Shadowing aliases detected for alias: {exp.alias}. "
                        + f"Expressions: {all_declared_aliases[exp.alias]}"
                    )
                )
            else:
                all_declared_aliases[exp.alias] = exp
def test_apply_quota(
    enabled: int,
    referrer: str,
    config_to_set: str,
    expected_quota: Optional[ResourceQuota],
) -> None:
    """
    Sets the enabled flag and the given config key to 5, runs the
    ResourceQuotaProcessor on a simple project-filtered query and checks the
    resource quota recorded in the settings.
    """
    state.set_config(ENABLED_CONFIG, enabled)
    state.set_config(config_to_set, 5)

    query = Query(
        QueryEntity(EntityKey.EVENTS, EntityColumnSet([])),
        selected_columns=[SelectedExpression("column2", Column(None, None, "column2"))],
        condition=binary_condition(
            ConditionFunctions.EQ,
            Column("_snuba_project_id", None, "project_id"),
            Literal(None, 1),
        ),
    )

    settings = HTTPQuerySettings()
    settings.referrer = referrer
    ResourceQuotaProcessor("project_id").process_query(query, settings)
    assert settings.get_resource_quota() == expected_quota
def build_and_rank_plans(
    self, query: LogicalQuery, settings: RequestSettings
) -> Sequence[ClickhouseQueryPlan]:
    """
    Builds the (single) Clickhouse query plan for a logical query: selects a
    storage, translates the query, attaches the physical from-clause and
    wires up the execution strategy. Each step is wrapped in a sentry span
    for tracing.
    """
    with sentry_sdk.start_span(
        op="build_plan.selected_storage", description="select_storage"
    ):
        storage, mappers = self.__selector.select_storage(query, settings)

    with sentry_sdk.start_span(
        op="build_plan.selected_storage", description="translate"
    ):
        # The QueryTranslator class should be instantiated once for each call to build_plan,
        # to avoid cache conflicts.
        clickhouse_query = QueryTranslator(mappers).translate(query)

    with sentry_sdk.start_span(
        op="build_plan.selected_storage", description="set_from_clause"
    ):
        clickhouse_query.set_from_clause(
            get_query_data_source(
                storage.get_schema().get_data_source(),
                final=query.get_final(),
                sampling_rate=query.get_sample(),
            )
        )

    cluster = storage.get_cluster()

    # Same processor list is used for the plan and for the execution
    # strategy below.
    db_query_processors = [
        *storage.get_query_processors(),
        *self.__post_processors,
        MandatoryConditionApplier(),
    ]

    return [
        ClickhouseQueryPlan(
            query=clickhouse_query,
            plan_query_processors=[],
            db_query_processors=db_query_processors,
            storage_set_key=storage.get_storage_set_key(),
            execution_strategy=SimpleQueryPlanExecutionStrategy(
                cluster=cluster,
                db_query_processors=db_query_processors,
                splitters=storage.get_query_splitters(),
            ),
        )
    ]
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    """
    Filters the query down to the projects carried by the extension data and
    registers the corresponding rate limit on the request settings.
    """
    project_ids = util.to_list(extension_data["project"])

    if project_ids:
        project_literals = [Literal(None, pid) for pid in project_ids]
        query.add_condition_to_ast(
            in_condition(
                Column(None, None, self.__project_column),
                project_literals,
            )
        )

    request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))