def test_table_rate_limit(
    query: Query, limit_to_set: str, params: RateLimitParameters
) -> None:
    set_config(limit_to_set, 50)
    request_settings = HTTPRequestSettings(consistent=True)
    TableRateLimit().process_query(query, request_settings)
    rate_limiters = request_settings.get_rate_limit_params()
    assert params in rate_limiters
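# A hedged sketch of the pytest parametrization that would drive
# test_table_rate_limit above. The fixture values and the RateLimitParameters
# constructor arguments are assumptions for illustration, not the project's
# actual test data:
#
# @pytest.mark.parametrize(
#     "query, limit_to_set, params",
#     [
#         (
#             Query(Table("errors_local", ColumnSet([])), selected_columns=[]),
#             "table_concurrent_limit_errors_local",
#             RateLimitParameters("table", "errors_local", None, 50),
#         ),
#     ],
# )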
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
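# What the test above exercises, in brief: ColumnSplitQueryStrategy first runs
# a narrow query selecting only the id/project/timestamp columns and, if that
# returns rows (as the stubbed do_query guarantees), runs a second query
# fetching the remaining columns restricted to those rows. When the strategy
# declines to split, execute() returns None, which is what expected_result
# checks against.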
def test_composite_planner(
    logical_query: CompositeQuery[Entity],
    composite_plan: CompositeQueryPlan,
    processed_query: CompositeQuery[Table],
) -> None:
    def assert_subquery_processors_equality(
        query: SubqueryProcessors, expected: SubqueryProcessors
    ) -> None:
        assert [type(x) for x in query.plan_processors] == [
            type(x) for x in expected.plan_processors
        ]
        assert [type(x) for x in query.db_processors] == [
            type(x) for x in expected.db_processors
        ]

    plan = CompositeQueryPlanner(
        deepcopy(logical_query), HTTPRequestSettings()
    ).build_best_plan()
    report = plan.query.equals(composite_plan.query)
    assert report[0], f"Mismatch: {report[1]}"

    # We cannot simply check equality between the plans because we need to
    # verify that the processors are of the same type; they can be different
    # instances, which would make a plain equality check fail.
    query_processors = plan.root_processors is not None
    expected_processors = composite_plan.root_processors is not None
    assert query_processors == expected_processors

    if plan.root_processors is not None and composite_plan.root_processors is not None:
        assert_subquery_processors_equality(
            plan.root_processors,
            composite_plan.root_processors,
        )

    query_alias_processors = plan.aliased_processors is not None
    expected_alias_processors = composite_plan.aliased_processors is not None
    assert query_alias_processors == expected_alias_processors

    if (
        plan.aliased_processors is not None
        and composite_plan.aliased_processors is not None
    ):
        assert len(plan.aliased_processors) == len(composite_plan.aliased_processors)
        for k in plan.aliased_processors:
            assert_subquery_processors_equality(
                plan.aliased_processors[k],
                composite_plan.aliased_processors[k],
            )

    def runner(
        query: Union[ClickhouseQuery, CompositeQuery[Table]],
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        report = query.equals(processed_query)
        assert report[0], f"Mismatch: {report[1]}"
        return QueryResult({"data": []}, {})

    CompositeExecutionPipeline(logical_query, HTTPRequestSettings(), runner).execute()
def test_storage_selector() -> None:
    state.set_config("enable_events_readonly_table", True)
    storage = get_storage(StorageKey.ERRORS)
    storage_ro = get_storage(StorageKey.ERRORS_RO)

    query = Query(Entity(EntityKey.EVENTS, ColumnSet([])), selected_columns=[])
    storage_selector = ErrorsQueryStorageSelector(mappers=errors_translators)
    assert (
        storage_selector.select_storage(
            query, HTTPRequestSettings(consistent=False)
        ).storage
        == storage_ro
    )
    assert (
        storage_selector.select_storage(
            query, HTTPRequestSettings(consistent=True)
        ).storage
        == storage
    )
def test_storage_selector() -> None:
    state.set_config("enable_events_readonly_table", True)
    storage = get_storage(StorageKey.EVENTS)
    storage_ro = get_storage(StorageKey.EVENTS_RO)

    query = Query({}, storage.get_schema().get_data_source())
    storage_selector = EventsQueryStorageSelector(storage, storage_ro)
    assert (
        storage_selector.select_storage(
            query, HTTPRequestSettings(consistent=False)
        ).storage
        == storage_ro
    )
    assert (
        storage_selector.select_storage(
            query, HTTPRequestSettings(consistent=True)
        ).storage
        == storage
    )
def test_type_condition_optimizer() -> None:
    cond1 = binary_condition(
        ConditionFunctions.EQ, Column(None, None, "col1"), Literal(None, "val1")
    )
    unprocessed_query = Query(
        Table("errors", ColumnSet([])),
        condition=binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                ConditionFunctions.NEQ,
                Column(None, None, "type"),
                Literal(None, "transaction"),
            ),
            cond1,
        ),
    )
    expected_query = Query(
        Table("errors", ColumnSet([])),
        condition=binary_condition(BooleanFunctions.AND, Literal(None, 1), cond1),
    )
    TypeConditionOptimizer().process_query(unprocessed_query, HTTPRequestSettings())

    assert expected_query.get_condition() == unprocessed_query.get_condition()
    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == "1 AND equals(col1, 'val1')"
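# Shared contract exercised by most tests in this file: a query processor
# mutates the query in place and returns None. A minimal sketch of that
# interface; the base-class import path and exact signature are assumptions
# inferred from how the processors are invoked above, not verified against
# the repository:
#
# from snuba.clickhouse.processors import QueryProcessor
#
# class NoopProcessor(QueryProcessor):
#     def process_query(
#         self, query: Query, request_settings: RequestSettings
#     ) -> None:
#         # Inspect or rewrite query.get_condition(), the selected columns,
#         # the FROM clause, etc.; there is no return value to check.
#         return None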
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }
    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )
def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)
        ),
    )
    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)
        ),
    )
    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline,
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request("", query_body, query, request_settings, "ref"),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [Result("events", query_result, ANY), Result("errors", query_result, ANY)],
    )
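# The synchronization idiom used above, in isolation: the test thread holds
# the condition, kicks off work that will invoke a callback on another thread,
# then waits with a timeout for the notify. A minimal self-contained sketch,
# not snuba-specific; threading.Timer stands in for the async pipeline:
def _example_wait_for_callback() -> bool:
    import threading

    cv = threading.Condition()
    fired = []

    def callback() -> None:
        with cv:
            fired.append(True)
            cv.notify()

    with cv:
        # Start the "async work" that will call back on another thread.
        threading.Timer(0.1, callback).start()
        # wait() releases the lock while blocking, so the callback can
        # acquire it, record its result, and notify.
        cv.wait(timeout=5)
    return bool(fired)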
def test_timeseries_column_format_expressions(
    granularity: int, ast_value: FunctionCall, formatted_value: str
) -> None:
    unprocessed = Query(
        {"granularity": granularity},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration",
                Column("transaction.duration", None, "duration"),
            ),
            SelectedExpression("my_time", Column("my_time", None, "time")),
        ],
    )
    expected = Query(
        {"granularity": granularity},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            SelectedExpression(
                "transaction.duration",
                Column("transaction.duration", None, "duration"),
            ),
            SelectedExpression(ast_value.alias, ast_value),
        ],
    )

    dataset = TransactionsDataset()
    for processor in dataset.get_query_processors():
        if isinstance(processor, TimeSeriesColumnProcessor):
            processor.process_query(unprocessed, HTTPRequestSettings())

    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )
    ret = unprocessed.get_selected_columns_from_ast()[1].expression.accept(
        ClickhouseExpressionFormatter()
    )
    assert ret == formatted_value
def test_hexint_column_processor(unprocessed: Expression, formatted_value: str) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[SelectedExpression("column1", Column(None, None, "column1"))],
        condition=unprocessed,
    )

    HexIntColumnProcessor(set(["column1"])).process_query(
        unprocessed_query, HTTPRequestSettings()
    )
    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression(
            "column1",
            FunctionCall(
                None,
                "lower",
                (FunctionCall(None, "hex", (Column(None, None, "column1"),)),),
            ),
        )
    ]

    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
def test_prewhere(initial_table, consistent, expected_table) -> None:
    state.set_config("enable_events_readonly_table", True)
    body = {
        "conditions": [
            ["d", "=", "1"],
            ["c", "=", "3"],
            ["a", "=", "1"],
            ["b", "=", "2"],
        ],
    }
    cols = ColumnSet([("col", String())])
    query = Query(
        body,
        TableSource(initial_table, cols, [["time", "=", "1"]], ["c1"]),
    )

    request_settings = HTTPRequestSettings(consistent=consistent)
    processor = ReadOnlyTableSelector("sentry_dist", "sentry_ro")
    processor.process_query(query, request_settings)

    source = query.get_data_source()
    assert isinstance(source, TableSource)
    assert source.format_from() == expected_table
    assert source.get_columns() == cols
    assert source.get_prewhere_candidates() == ["c1"]
    assert source.get_mandatory_conditions() == [["time", "=", "1"]]
def test_uuid_array_column_processor(
    unprocessed: Expression,
    expected: Expression,
    formatted_value: str,
) -> None:
    unprocessed_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[SelectedExpression("column2", Column(None, None, "column2"))],
        condition=unprocessed,
    )
    expected_query = Query(
        Table("transactions", ColumnSet([])),
        selected_columns=[SelectedExpression("column2", Column(None, None, "column2"))],
        condition=expected,
    )

    FixedStringArrayColumnProcessor(set(["column1", "column2"]), 32).process_query(
        unprocessed_query, HTTPRequestSettings()
    )

    assert unprocessed_query.get_selected_columns() == [
        SelectedExpression("column2", Column(None, None, "column2"))
    ]

    assert expected_query.get_condition() == unprocessed_query.get_condition()
    condition = unprocessed_query.get_condition()
    assert condition is not None
    ret = condition.accept(ClickhouseExpressionFormatter())
    assert ret == formatted_value
def test_timeseries_column_format_expressions(
    granularity, ast_value, formatted_value
) -> None:
    unprocessed = Query(
        {"granularity": granularity},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            Column("transaction.duration", "duration", None),
            Column("my_start", "bucketed_start", None),
        ],
    )
    expected = Query(
        {"granularity": granularity},
        TableSource("transactions", ColumnSet([])),
        selected_columns=[
            Column("transaction.duration", "duration", None),
            ast_value,
        ],
    )

    dataset = TransactionsDataset()
    # Reach through Python's name mangling to get the private
    # _TimeSeriesDataset__time_group_columns mapping the processor needs.
    TimeSeriesColumnProcessor(
        dataset._TimeSeriesDataset__time_group_columns
    ).process_query(unprocessed, HTTPRequestSettings())
    assert (
        expected.get_selected_columns_from_ast()
        == unprocessed.get_selected_columns_from_ast()
    )
    ret = unprocessed.get_selected_columns_from_ast()[1].accept(
        ClickhouseExpressionFormatter()
    )
    assert ret == formatted_value
def test_without_turbo_without_projects_needing_final(query: ClickhouseQuery) -> None:
    PostReplacementConsistencyEnforcer("project_id", None).process_query(
        query, HTTPRequestSettings()
    )

    assert query.get_condition_from_ast() == build_in("project_id", [2])
    assert not query.get_from_clause().final
def test_mand_conditions(table: str, mand_conditions: List[FunctionCall]) -> None:
    query = Query(
        Table(
            table,
            ColumnSet([]),
            final=False,
            sampling_rate=None,
            mandatory_conditions=mand_conditions,
        ),
        None,
        None,
        binary_condition(
            BooleanFunctions.AND,
            binary_condition(
                OPERATOR_TO_FUNCTION["="],
                Column("d", None, "d"),
                Literal(None, "1"),
            ),
            binary_condition(
                OPERATOR_TO_FUNCTION["="],
                Column("c", None, "c"),
                Literal(None, "3"),
            ),
        ),
    )

    query_ast_copy = copy.deepcopy(query)

    request_settings = HTTPRequestSettings(consistent=True)
    processor = MandatoryConditionApplier()
    processor.process_query(query, request_settings)

    query_ast_copy.add_condition_to_ast(combine_and_conditions(mand_conditions))

    assert query.get_condition_from_ast() == query_ast_copy.get_condition_from_ast()
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            # Cannot test complex conditions based on explicit calls to the
            # `and` and `or` functions, because they would not be parsed as
            # datetime by the old parser.
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }
    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )
def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    query = Query(
        {
            "selected_columns": selected_cols,
            "conditions": conditions,
            "arrayjoin": None,
            "having": [],
            "groupby": groupby,
            "aggregations": [],
            "orderby": None,
            "limitby": None,
            "sample": 10,
            "limit": 100,
            "offset": 50,
            "totals": True,
            "granularity": 60,
        },
        simple_join_structure,
    )
    request_settings = HTTPRequestSettings()

    optimizer = SimpleJoinOptimizer()
    optimizer.process_query(query, request_settings)

    assert query.get_data_source().format_from() == expected
def test_format_expressions(query: Query, expected_query: Query) -> None:
    processor = CustomFunction(
        ColumnSet(
            [("param1", String()), ("param2", UInt(8)), ("other_col", String())]
        ),
        "f_call",
        [("param1", ColType({String})), ("param2", ColType({UInt}))],
        partial_function(
            "f_call_impl(param1, inner_call(param2), my_const)",
            [("my_const", 420)],
        ),
    )
    # We cannot simply run == on the query objects: their contents differ,
    # since one holds only the AST while the other holds the AST plus the
    # raw body.
    processor.process_query(query, HTTPRequestSettings())

    assert (
        query.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert query.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert query.get_condition_from_ast() == expected_query.get_condition_from_ast()
    assert query.get_arrayjoin_from_ast() == expected_query.get_arrayjoin_from_ast()
    assert query.get_having_from_ast() == expected_query.get_having_from_ast()
    assert query.get_orderby_from_ast() == expected_query.get_orderby_from_ast()
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity()
        .get_all_storages()[0]
        .get_schema()
        .get_data_source(),
    )

    def do_query(
        chunk_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # The strategies should decline to split, so the query must be
        # passed through unchanged.
        assert chunk_query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster(
            "localhost", 1024, "default", "", "default", 80, set(), True
        ),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    ).process_query(query, HTTPRequestSettings())

    assert query.get_condition_from_ast() == FunctionCall(
        None,
        BooleanFunctions.AND,
        (
            FunctionCall(
                None,
                "notIn",
                (
                    FunctionCall(
                        None, "assumeNotNull", (Column(None, None, "group_id"),)
                    ),
                    FunctionCall(
                        None,
                        "tuple",
                        (Literal(None, 100), Literal(None, 101), Literal(None, 102)),
                    ),
                ),
            ),
            build_in("project_id", [2]),
        ),
    )
    assert not query.get_from_clause().final
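# Context for the assertion above: when the number of replaced groups for the
# project stays under the max_group_ids_exclude setting, the enforcer can
# exclude those groups with a `group_id NOT IN (...)` condition instead of
# forcing a FINAL read of the table. That is why the test checks both the
# rewritten condition and that the FROM clause is not final.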
def test_format_expressions(pre_format: Query, expected_query: Query) -> None:
    copy = deepcopy(pre_format)
    BasicFunctionsProcessor().process_query(copy, HTTPRequestSettings())
    assert (
        copy.get_selected_columns_from_ast()
        == expected_query.get_selected_columns_from_ast()
    )
    assert copy.get_groupby_from_ast() == expected_query.get_groupby_from_ast()
    assert copy.get_condition_from_ast() == expected_query.get_condition_from_ast()
def test_project_extension_project_rate_limits_are_overridden() -> None:
    extension = ProjectExtension(project_column="project_id")
    raw_data = {"project": [3, 4]}
    valid_data = validate_jsonschema(raw_data, extension.get_schema())
    query = Query({"conditions": []}, QueryEntity(EntityKey.EVENTS, ColumnSet([])))
    request_settings = HTTPRequestSettings()
    state.set_config("project_per_second_limit_3", 5)
    state.set_config("project_concurrent_limit_3", 10)

    extension.get_processor().process_query(query, valid_data, request_settings)

    rate_limits = request_settings.get_rate_limit_params()
    most_recent_rate_limit = rate_limits[-1]

    assert most_recent_rate_limit.bucket == "3"
    assert most_recent_rate_limit.per_second_limit == 5
    assert most_recent_rate_limit.concurrent_limit == 10
def test_alias_validation(
    query_body: MutableMapping[str, Any], expected_result: bool
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query_plan = events.get_query_plan_builder().build_plan(
        Request("", query, HTTPRequestSettings(), {}, "")
    )

    assert query_plan.query.validate_aliases() == expected_result
def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()

    assert (
        DictClickhouseQuery(dataset, query, request_settings).format_sql()
        == expected_query
    )
def test_transformer() -> None:
    query = build_query(Literal(None, "10"))
    TagsTypeTransformer().process_query(query, HTTPRequestSettings())
    assert query.get_selected_columns()[0].expression == SubscriptableReference(
        "_snuba_tags[10]", Column(None, None, "tags"), Literal(None, 10)
    )
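# `build_query` is a helper defined elsewhere in this test module. A hedged
# sketch of what it plausibly builds, based on the assertion above; the table
# name and the empty ColumnSet are placeholders, not the module's actual
# helper:
#
# def build_query(tag_key: Literal) -> Query:
#     return Query(
#         Table("events", ColumnSet([])),
#         selected_columns=[
#             SelectedExpression(
#                 "tags[10]",
#                 SubscriptableReference(
#                     "_snuba_tags[10]", Column(None, None, "tags"), tag_key
#                 ),
#             )
#         ],
#     )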
def test_format_clickhouse_specific_query() -> None:
    """
    Adds a few of the ClickHouse-specific fields to the query.
    """
    query = Query(
        {"sample": 0.1, "totals": True, "limitby": (10, "environment")},
        TableSource("my_table", ColumnSet([])),
        selected_columns=[
            SelectedExpression("column1", Column(None, None, "column1")),
            SelectedExpression("column2", Column(None, "table1", "column2")),
        ],
        condition=binary_condition(
            None,
            "eq",
            lhs=Column(None, None, "column1"),
            rhs=Literal(None, "blabla"),
        ),
        groupby=[Column(None, None, "column1"), Column(None, "table1", "column2")],
        having=binary_condition(
            None,
            "eq",
            lhs=Column(None, None, "column1"),
            rhs=Literal(None, 123),
        ),
        order_by=[OrderBy(OrderByDirection.ASC, Column(None, None, "column1"))],
        array_join=Column(None, None, "column1"),
    )
    query.set_final(True)
    query.set_offset(50)
    query.set_limit(100)

    request_settings = HTTPRequestSettings()
    clickhouse_query = AstSqlQuery(query, request_settings)

    expected = {
        "from": "FROM my_table FINAL SAMPLE 0.1",
        "group": "GROUP BY (column1, table1.column2) WITH TOTALS",
        "having": "HAVING eq(column1, 123)",
        "array_join": "ARRAY JOIN column1",
        "limit": "LIMIT 100 OFFSET 50",
        "limitby": "LIMIT 10 BY environment",
        "order": "ORDER BY column1 ASC",
        "select": "SELECT column1, table1.column2",
        "where": "WHERE eq(column1, 'blabla')",
    }

    assert clickhouse_query.sql_data() == expected
def test_invalid_call(query: Query) -> None:
    processor = CustomFunction(
        ColumnSet(
            [("param1", String()), ("param2", UInt(8)), ("other_col", String())]
        ),
        "f_call",
        [("param1", ColType({String})), ("param2", ColType({UInt}))],
        simple_function("f_call_impl(param1, inner_call(param2))"),
    )
    with pytest.raises(InvalidCustomFunctionCall):
        processor.process_query(query, HTTPRequestSettings())
def test_tags_hash_map(
    query: ClickhouseQuery, expected_condition: Expression
) -> None:
    set_config("tags_hash_map_enabled", 1)
    MappingOptimizer(
        column_name="tags",
        hash_map_name="_tags_hash_map",
        killswitch="tags_hash_map_enabled",
    ).process_query(query, HTTPRequestSettings())

    assert query.get_condition() == expected_condition
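# Context for the optimization under test: MappingOptimizer rewrites
# conditions shaped like
#   equals(arrayElement(tags.value, indexOf(tags.key, 'k')), 'v')
# into a membership check on the pre-hashed map column, roughly
#   has(_tags_hash_map, cityHash64('k=v'))
# This is a sketch of the transformation as implied by the parametrized
# query/expected_condition pairs; the exact key escaping rules live in the
# processor itself.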
def test_condition_enforcer(query: Query, valid: bool) -> None:
    set_config("mandatory_condition_enforce", 1)
    request_settings = HTTPRequestSettings(consistent=True)
    processor = MandatoryConditionEnforcer([OrgIdEnforcer(), ProjectIdEnforcer()])
    if valid:
        processor.process_query(query, request_settings)
    else:
        with pytest.raises(AssertionError):
            processor.process_query(query, request_settings)
def test_sessions_processing() -> None:
    query_body = {
        "selected_columns": ["duration_quantiles", "sessions", "users"],
        "conditions": [
            ["org_id", "=", 1],
            ["project_id", "=", 1],
            ["started", ">", "2020-01-01 12:00:00"],
        ],
    }

    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        quantiles = tuple(
            Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1]
        )
        assert query.get_selected_columns() == [
            SelectedExpression(
                "duration_quantiles",
                CurriedFunctionCall(
                    "_snuba_duration_quantiles",
                    FunctionCall(None, "quantilesIfMerge", quantiles),
                    (Column(None, None, "duration_quantiles"),),
                ),
            ),
            SelectedExpression(
                "sessions",
                FunctionCall(
                    "_snuba_sessions",
                    "plus",
                    (
                        FunctionCall(
                            None, "countIfMerge", (Column(None, None, "sessions"),)
                        ),
                        FunctionCall(
                            None,
                            "sumIfMerge",
                            (Column(None, None, "sessions_preaggr"),),
                        ),
                    ),
                ),
            ),
            SelectedExpression(
                "users",
                FunctionCall(
                    "_snuba_users", "uniqIfMerge", (Column(None, None, "users"),)
                ),
            ),
        ]
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()
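# The assertions above reflect ClickHouse's -Merge aggregate combinators: the
# sessions storage holds pre-aggregated AggregateFunction(...) states, so
# reads must merge those states (quantilesIfMerge, countIfMerge, sumIfMerge,
# uniqIfMerge) rather than recompute from raw rows. Roughly, as SQL; a sketch
# only, and the table name is an assumption:
#
#   SELECT
#       quantilesIfMerge(0.5, 0.75, 0.9, 0.95, 0.99, 1)(duration_quantiles),
#       countIfMerge(sessions) + sumIfMerge(sessions_preaggr),
#       uniqIfMerge(users)
#   FROM sessions_hourly_local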