def detect_table(
    query: Query,
    events_only_columns: ColumnSet,
    transactions_only_columns: ColumnSet,
) -> str:
    """
    Given a query, we attempt to guess whether it is better to fetch data
    from the "events" or the "transactions" storage. This is going to be
    wrong in some cases.
    """
    # First check for a top level condition on the event type, i.e.
    # type = 'error' or type = 'transaction'.
    conditions = query.get_conditions()
    if conditions:
        for condition in conditions:
            if is_condition(condition):
                if tuple(condition) == ("type", "=", "error"):
                    return EVENTS
                elif tuple(condition) == ("type", "=", "transaction"):
                    return TRANSACTIONS

    # Check for any conditions that reference a table specific field
    condition_columns = query.get_columns_referenced_in_conditions()
    if any(events_only_columns.get(col) for col in condition_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in condition_columns):
        return TRANSACTIONS

    # Check for any other references to a table specific field
    all_referenced_columns = query.get_all_referenced_columns()
    if any(events_only_columns.get(col) for col in all_referenced_columns):
        return EVENTS
    if any(transactions_only_columns.get(col) for col in all_referenced_columns):
        return TRANSACTIONS

    # Use events by default
    return EVENTS
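# Illustrative mapping (plain data, not Snuba code) from example query shapes
# to the storage the heuristic above picks, listed in precedence order.
ROUTING_EXAMPLES = {
    "top level condition type = 'error'": "events",
    "top level condition type = 'transaction'": "transactions",
    "condition referencing an events-only column": "events",
    "condition referencing a transactions-only column": "transactions",
    "any other reference to an events-only column": "events",
    "any other reference to a transactions-only column": "transactions",
    "nothing table specific": "events",  # the default
}
assert ROUTING_EXAMPLES["nothing table specific"] == "events"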
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    def process_column(exp: Expression) -> Expression:
        if isinstance(exp, Column):
            if exp.column_name == "group_id":
                return FunctionCall(
                    exp.alias,
                    "nullIf",
                    (
                        Column(None, exp.column_name, exp.table_name),
                        Literal(None, 0),
                    ),
                )
            elif exp.column_name == "message":
                # Because of the rename from message->search_message without
                # backfill, records will have one or the other of these fields.
                # TODO: this can be removed once all data has search_message
                # filled in.
                return FunctionCall(
                    exp.alias,
                    "coalesce",
                    (
                        Column(None, exp.column_name, exp.table_name),
                        Column(None, "search_message", exp.table_name),
                    ),
                )

        return exp

    query.transform_expressions(process_column)
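# Illustrative sketch (not Snuba code) of the SQL-level effect of the column
# rewrite above, expressed as plain strings.
def rewritten_column_sql(column_name: str) -> str:
    if column_name == "group_id":
        # A stored 0 is exposed as NULL to callers.
        return "nullIf(group_id, 0)"
    if column_name == "message":
        # Rows hold either `message` or `search_message`, so fall back.
        return "coalesce(message, search_message)"
    return column_name

assert rewritten_column_sql("group_id") == "nullIf(group_id, 0)"
assert rewritten_column_sql("message") == "coalesce(message, search_message)"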
def test_nested_aggregate_legacy_format(self, dataset):
    source = (
        dataset.get_all_storages()[0]
        .get_schemas()
        .get_read_schema()
        .get_data_source()
    )
    priority = [
        "toUInt64(plus(multiply(log(times_seen), 600), last_seen))",
        "",
        "priority",
    ]
    assert (
        column_expr(
            dataset,
            "",
            Query({"aggregations": [priority]}, source),
            ParsingContext(),
            priority[2],
            priority[0],
        )
        == "(toUInt64(plus(multiply(log(times_seen), 600), last_seen)) AS priority)"
    )

    top_k = ["topK(3)", "logger", "top_3"]
    assert (
        column_expr(
            dataset,
            top_k[1],
            Query({"aggregations": [top_k]}, source),
            ParsingContext(),
            top_k[2],
            top_k[0],
        )
        == "(topK(3)(logger) AS top_3)"
    )
def process_query(self, query: Query, request_settings: RequestSettings) -> None: def process_functions(exp: Expression) -> Expression: if isinstance(exp, FunctionCall): if exp.function_name == "uniq": return FunctionCall( exp.alias, "ifNull", ( replace(exp, alias=None), Literal(None, 0), ), ) if exp.function_name == "emptyIfNull": return FunctionCall( exp.alias, "ifNull", ( replace(exp, alias=None), Literal(None, ""), ), ) if isinstance(exp, CurriedFunctionCall): if exp.internal_function.function_name == "top": return replace( exp, internal_function=replace(exp.internal_function, function_name="topK"), ) return exp query.transform_expressions(process_functions)
def test_query_extension_processing(
    raw_data: dict,
    expected_conditions: Sequence[Condition],
    expected_granularity: int,
):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=60,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )

    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    extension.get_processor().process_query(query, valid_data, request_settings)
    assert query.get_conditions() == expected_conditions
    assert query.get_granularity() == expected_granularity
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    from_clause = query.get_data_source()
    if not isinstance(from_clause, JoinClause):
        return

    referenced_columns = query.get_all_referenced_columns()
    referenced_aliases = set()
    for qualified_column in referenced_columns:
        # This will be much better when we represent columns
        # with a more structured data type than strings.
        match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
        if match:
            # match[1] is the first parenthesized group in the regex, thus
            # the table alias.
            table_alias = match[1]
            referenced_aliases.add(table_alias)

    assert (
        len(referenced_aliases) > 0
    ), "Trying to optimize a join query without aliases"
    if len(referenced_aliases) > 1:
        return

    from_tables = from_clause.get_tables()
    table = from_tables[referenced_aliases.pop()]
    query.set_data_source(table)
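# Minimal sketch of the alias extraction relied on above. The real
# QUALIFIED_COLUMN_REGEX is not shown in this section; this pattern is an
# assumption that a qualified column is spelled "<table_alias>.<column>".
import re

QUALIFIED_COLUMN_REGEX_SKETCH = re.compile(r"^([A-Za-z_][A-Za-z0-9_]*)\.(.+)$")

match = QUALIFIED_COLUMN_REGEX_SKETCH.match("events.group_id")
assert match is not None
assert match[1] == "events"  # the table alias, as used by the optimizer
assert match[2] == "group_id"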
def test_prewhere(initial_table, consistent, expected_table) -> None:
    state.set_config("enable_events_readonly_table", True)
    body = {
        "conditions": [
            ["d", "=", "1"],
            ["c", "=", "3"],
            ["a", "=", "1"],
            ["b", "=", "2"],
        ],
    }

    cols = ColumnSet([("col", String())])
    query = Query(
        body,
        TableSource(initial_table, cols, [["time", "=", "1"]], ["c1"]),
    )

    request_settings = HTTPRequestSettings(consistent=consistent)
    processor = ReadOnlyTableSelector("sentry_dist", "sentry_ro")
    processor.process_query(query, request_settings)

    source = query.get_data_source()
    assert isinstance(source, TableSource)
    assert source.format_from() == expected_table
    assert source.get_columns() == cols
    assert source.get_prewhere_candidates() == ["c1"]
    assert source.get_mandatory_conditions() == [["time", "=", "1"]]
def process_query(self, query: Query, extension_data: ExtensionData) -> None:
    from_date, to_date = self.get_time_limit(extension_data)
    query.add_conditions([
        (self.__timestamp_column, '>=', from_date.isoformat()),
        (self.__timestamp_column, '<', to_date.isoformat()),
    ])
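# Illustrative sketch of the conditions the processor above adds, assuming a
# five-day window ending 2020-01-06 and a timestamp column named "timestamp".
import datetime

to_date = datetime.datetime(2020, 1, 6)
from_date = to_date - datetime.timedelta(days=5)
added_conditions = [
    ("timestamp", ">=", from_date.isoformat()),
    ("timestamp", "<", to_date.isoformat()),
]
assert added_conditions == [
    ("timestamp", ">=", "2020-01-01T00:00:00"),
    ("timestamp", "<", "2020-01-06T00:00:00"),
]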
def process_query(self, query: Query, request_settings: RequestSettings) -> None: def process_functions(exp: Expression) -> Expression: if isinstance(exp, FunctionCall) and exp.function_name == "impact": assert len(exp.parameters) == 3 column = exp.parameters[0] satisfied = exp.parameters[1] user_column = exp.parameters[2] return plus( minus(Literal(None, 1), apdex(column, satisfied)), multiply( minus( Literal(None, 1), div( Literal(None, 1), FunctionCall( None, "sqrt", (FunctionCall(None, "uniq", user_column)), ), ), ), Literal(None, 3), ), ) return exp query.transform_expressions(process_functions)
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    query = Query(
        {
            "selected_columns": selected_cols,
            "conditions": conditions,
            "arrayjoin": None,
            "having": [],
            "groupby": groupby,
            "aggregations": [],
            "orderby": None,
            "limitby": None,
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        simple_join_structure,
    )
    request_settings = HTTPRequestSettings()

    optimizer = SimpleJoinOptimizer()
    optimizer.process_query(query, request_settings)

    assert query.get_data_source().format_from() == expected
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    organization_id = extension_data["organization"]
    query.add_conditions([("org_id", "=", organization_id)])
def test_format_expressions(pre_format: Query, expected_query: Query) -> None:
    copy = deepcopy(pre_format)
    BasicFunctionsProcessor().process_query(copy, HTTPRequestSettings())
    assert (
        copy.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert copy.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert copy.get_condition_from_ast() == expected_query.get_condition_from_ast()
def column_expr(
    self,
    column_name,
    query: Query,
    parsing_context: ParsingContext,
    table_alias: str = "",
):
    detected_dataset = detect_table(
        query, self.__events_columns, self.__transactions_columns
    )

    if detected_dataset == TRANSACTIONS:
        if column_name == "time":
            return self.time_expr("finish_ts", query.get_granularity(), table_alias)
        if column_name == "type":
            return "'transaction'"
        if column_name == "timestamp":
            return "finish_ts"
        if column_name == "username":
            return "user_name"
        if column_name == "email":
            return "user_email"
        if column_name == "transaction":
            return "transaction_name"
        if column_name == "message":
            return "transaction_name"
        if column_name == "title":
            return "transaction_name"
        if column_name == "group_id":
            # TODO: We return 0 here instead of NULL so conditions like
            # group_id in (1, 2, 3) will work, since Clickhouse won't run a
            # query like:
            # SELECT (NULL AS group_id) FROM transactions WHERE group_id IN (1, 2, 3)
            # When we have the query AST, we should solve this by transforming
            # the nonsensical conditions instead.
            return "0"
        if column_name == "geo_country_code":
            column_name = "contexts[geo.country_code]"
        if column_name == "geo_region":
            column_name = "contexts[geo.region]"
        if column_name == "geo_city":
            column_name = "contexts[geo.city]"
        if self.__events_columns.get(column_name):
            return "NULL"
    else:
        if column_name == "time":
            return self.time_expr("timestamp", query.get_granularity(), table_alias)
        if column_name == "release":
            column_name = "tags[sentry:release]"
        if column_name == "dist":
            column_name = "tags[sentry:dist]"
        if column_name == "user":
            column_name = "tags[sentry:user]"
        if self.__transactions_columns.get(column_name):
            return "NULL"

    return get_dataset(detected_dataset).column_expr(
        column_name, query, parsing_context
    )
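# Illustrative summary (not Snuba code) of the column aliases applied above
# when the query is routed to the transactions storage.
TRANSACTIONS_COLUMN_ALIASES = {
    "timestamp": "finish_ts",
    "username": "user_name",
    "email": "user_email",
    "transaction": "transaction_name",
    "message": "transaction_name",
    "title": "transaction_name",
    "group_id": "0",  # literal 0 so group_id IN (...) conditions keep working
    "geo_country_code": "contexts[geo.country_code]",
    "geo_region": "contexts[geo.region]",
    "geo_city": "contexts[geo.city]",
}
assert TRANSACTIONS_COLUMN_ALIASES["title"] == "transaction_name"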
def test_col_replacement(
    initial_query: MutableMapping[str, Any],
    old_col: str,
    new_col: str,
    expected: Mapping[str, Any],
):
    query = Query(initial_query, TableSource("my_table", ColumnSet([])))
    query.replace_column(old_col, new_col)
    assert expected == query.get_body()
def setup_method(self, test_method): super().setup_method(test_method) raw_data = {"project": 2} self.extension = ProjectExtension( processor=ProjectWithGroupsProcessor(project_column="project_id") ) self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema()) self.query = Query({"conditions": []}, TableSource("my_table", ColumnSet([])),)
def test_prewhere(query_body, keys, new_conditions, prewhere_conditions) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    query = Query(query_body, TableSource("my_table", ColumnSet([]), None, keys))
    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(query, request_settings)

    assert query.get_conditions() == new_conditions
    assert query.get_prewhere() == prewhere_conditions
class TestProjectExtensionWithGroups(BaseTest):
    def setup_method(self, test_method):
        super().setup_method(test_method)
        raw_data = {'project': 2}

        self.extension = ProjectExtension(
            processor=ProjectWithGroupsProcessor()
        )
        self.valid_data = validate_jsonschema(raw_data, self.extension.get_schema())
        self.query = Query({
            "conditions": []
        })

    def test_with_turbo(self):
        request_settings = RequestSettings(turbo=True, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]

    def test_without_turbo_with_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        replacer.set_project_needs_final(2)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()

    def test_without_turbo_without_projects_needing_final(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert not self.query.get_final()

    def test_when_there_are_not_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 5)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        expected = [
            ('project_id', 'IN', [2]),
            (['assumeNotNull', ['group_id']], 'NOT IN', [100, 101, 102]),
        ]
        assert self.query.get_conditions() == expected
        assert not self.query.get_final()

    def test_when_there_are_too_many_groups_to_exclude(self):
        request_settings = RequestSettings(turbo=False, consistent=False, debug=False)
        state.set_config('max_group_ids_exclude', 2)
        replacer.set_project_exclude_groups(2, [100, 101, 102])

        self.extension.get_processor().process_query(self.query, self.valid_data, request_settings)

        assert self.query.get_conditions() == [('project_id', 'IN', [2])]
        assert self.query.get_final()
def test_conditions_expr():
    dataset = get_dataset("groups")
    state.set_config('use_escape_alias', 1)
    conditions = [['events.a', '=', 1]]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.a AS `events.a`) = 1'

    conditions = [
        [['events.a', '=', 1], ['groups.b', '=', 2]],
        [['events.c', '=', 3], ['groups.d', '=', 4]],
    ]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == ('((events.a AS `events.a`) = 1 OR (groups.b AS `groups.b`) = 2)'
            ' AND ((events.c AS `events.c`) = 3 OR (groups.d AS `groups.d`) = 4)')

    # Test column expansion
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    expanded = column_expr(dataset, 'events.tags[foo]', Query({}), ParsingContext())
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '({} = 1 OR (groups.b AS `groups.b`) = 2)'.format(expanded)

    # Test using alias if column has already been expanded in SELECT clause
    reuse_query = Query({})
    parsing_context = ParsingContext()
    conditions = [[['events.tags[foo]', '=', 1], ['groups.b', '=', 2]]]
    # Expand it once so the next time is aliased
    column_expr(dataset, 'events.tags[foo]', reuse_query, parsing_context)
    assert conditions_expr(dataset, conditions, reuse_query, parsing_context) \
        == '(`events.tags[foo]` = 1 OR (groups.b AS `groups.b`) = 2)'

    # Test special output format of LIKE
    conditions = [['events.primary_hash', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == '(events.primary_hash AS `events.primary_hash`) LIKE \'%foo%\''

    conditions = tuplify(
        [[['notEmpty', ['arrayElement', ['events.exception_stacks.type', 1]]], '=', 1]]
    )
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(arrayElement((events.exception_stacks.type AS `events.exception_stacks.type`), 1)) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags[sentry:user]']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty(`events.tags[sentry:user]`) = 1'

    conditions = tuplify([[['notEmpty', ['events.tags_key']], '=', 1]])
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'notEmpty((arrayJoin(events.tags.key) AS `events.tags_key`)) = 1'

    # Test scalar condition on array column is expanded as an iterator.
    conditions = [['events.exception_frames.filename', 'LIKE', '%foo%']]
    assert conditions_expr(dataset, conditions, Query({}), ParsingContext()) \
        == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), (events.exception_frames.filename AS `events.exception_frames.filename`))'
def test_empty_query():
    query = Query({})

    assert query.get_selected_columns() is None
    assert query.get_aggregations() is None
    assert query.get_groupby() is None
    assert query.get_conditions() is None
    assert query.get_orderby() is None
    assert query.get_sample() is None
    assert query.get_limit() == 0
    assert query.get_offset() == 0
def process_query(self, query: Query, request_settings: RequestSettings) -> None: def process_functions(exp: Expression) -> Expression: if isinstance(exp, FunctionCall) and exp.function_name == "apdex": assert len(exp.parameters) == 2 column = exp.parameters[0] satisfied = exp.parameters[1] return apdex(column, satisfied) return exp query.transform_expressions(process_functions)
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    from_date, to_date = self.get_time_limit(extension_data)
    query.set_granularity(extension_data["granularity"])
    query.add_conditions([
        (self.__timestamp_column, '>=', from_date.isoformat()),
        (self.__timestamp_column, '<', to_date.isoformat()),
    ])
def test_query_extension_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    state.set_config('max_days', 1)
    extension = TimeSeriesExtension(
        default_granularity=3600,
        default_window=datetime.timedelta(days=5),
        timestamp_column='timestamp',
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})

    extension.get_processor().process_query(query, valid_data)
    assert query.get_conditions() == expected_conditions
def test_project_extension_query_processing(raw_data: dict, expected_conditions: Sequence[Condition]):
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor()
    )
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({
        "conditions": []
    })
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == expected_conditions
def test_data_source(
    self, query_body: MutableMapping[str, Any], expected_dataset: str
):
    query = Query(query_body, get_dataset_source("discover"))
    request_settings = HTTPRequestSettings()
    for processor in get_dataset("discover").get_query_processors():
        processor.process_query(query, request_settings)

    assert (
        query.get_data_source().format_from()
        == get_dataset_source(expected_dataset).format_from()
    )
def process_query(
    self,
    query: Query,
    extension_data: ExtensionData,
    request_settings: RequestSettings,
) -> None:
    project_ids = util.to_list(extension_data['project'])

    if project_ids:
        query.add_conditions([('project_id', 'IN', project_ids)])

    request_settings.add_rate_limit(self._get_rate_limit_params(project_ids))

    self.do_post_processing(project_ids, query, request_settings)
def do_post_processing(
    self,
    project_ids: Sequence[int],
    query: Query,
    request_settings: RequestSettings,
) -> None:
    if not request_settings.get_turbo():
        final, exclude_group_ids = get_projects_query_flags(
            project_ids, self.__replacer_state_name
        )
        if not final and exclude_group_ids:
            # If the number of groups to exclude exceeds our limit, the query
            # should just use final instead of the exclusion set.
            max_group_ids_exclude = get_config(
                "max_group_ids_exclude", settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE
            )
            if len(exclude_group_ids) > max_group_ids_exclude:
                query.set_final(True)
            else:
                query.add_conditions([
                    (["assumeNotNull", ["group_id"]], "NOT IN", exclude_group_ids)
                ])
                query.add_condition_to_ast(
                    not_in_condition(
                        None,
                        FunctionCall(
                            None, "assumeNotNull", (Column(None, "group_id", None),)
                        ),
                        [Literal(None, p) for p in exclude_group_ids],
                    )
                )
        else:
            query.set_final(final)
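# Standalone sketch (not Snuba code) of the FINAL-vs-exclusion decision above:
# FINAL is used when replacements require it, or when the exclusion set is too
# large to express as a NOT IN condition.
from typing import Sequence

def use_final_sketch(
    needs_final: bool,
    exclude_group_ids: Sequence[int],
    max_group_ids_exclude: int,
) -> bool:
    if needs_final:
        return True
    return bool(exclude_group_ids) and len(exclude_group_ids) > max_group_ids_exclude

assert use_final_sketch(False, [100, 101, 102], 2) is True   # too many groups
assert use_final_sketch(False, [100, 101, 102], 5) is False  # NOT IN is used instead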
def test_organization_extension_query_processing_happy_path():
    extension = OrganizationExtension()
    raw_data = {"organization": 2}

    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []})
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    extension.get_processor().process_query(query, valid_data, request_settings)

    assert query.get_conditions() == [("org_id", "=", 2)]
def process_query( self, query: Query, extension_data: ExtensionData, request_settings: RequestSettings, ) -> None: organization_id = extension_data["organization"] query.add_conditions([("org_id", "=", organization_id)]) query.add_condition_to_ast( binary_condition( None, ConditionFunctions.EQ, Column(None, "org_id", None), Literal(None, organization_id), ))
def test_project_extension_query_adds_rate_limits():
    extension = ProjectExtension(
        processor=ProjectExtensionProcessor(project_column="project_id")
    )
    raw_data = {'project': [2, 3]}
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query(
        {"conditions": []},
        TableSource("my_table", ColumnSet([])),
    )
    request_settings = RequestSettings(turbo=False, consistent=False, debug=False)

    num_rate_limits_before_processing = len(request_settings.get_rate_limit_params())
    extension.get_processor().process_query(query, valid_data, request_settings)

    rate_limits = request_settings.get_rate_limit_params()
    # make sure a rate limit was added by the processing
    assert len(rate_limits) == num_rate_limits_before_processing + 1

    most_recent_rate_limit = rate_limits[-1]
    assert most_recent_rate_limit.bucket == '2'
    assert most_recent_rate_limit.per_second_limit == 1000
    assert most_recent_rate_limit.concurrent_limit == 1000