def test_serialization() -> None:
    profile = ClickhouseQueryProfile(
        time_range=10,
        table="events",
        all_columns={"col", "timestamp", "arrayjoin"},
        multi_level_condition=True,
        where_profile=FilterProfile(
            columns={"timestamp"},
            mapping_cols=set(),
        ),
        groupby_cols={"col"},
        array_join_cols={"arrayjoin"},
    )

    assert profile.to_dict() == {
        "time_range": 10,
        "table": "events",
        "all_columns": ["arrayjoin", "col", "timestamp"],
        "multi_level_condition": True,
        "where_profile": {"columns": ["timestamp"], "mapping_cols": []},
        "groupby_cols": ["col"],
        "array_join_cols": ["arrayjoin"],
    }
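# Hedged sketch, not the real definitions from snuba.querylog.query_metadata:
# a minimal pair of dataclasses that would satisfy the to_dict() contract the
# test above asserts, i.e. every set-valued field is serialized as a sorted
# list so the output is deterministic and JSON friendly. The "*Sketch" names
# are hypothetical; field names mirror the test.
from dataclasses import dataclass
from typing import Any, Mapping, Set


@dataclass(frozen=True)
class FilterProfileSketch:
    columns: Set[str]
    mapping_cols: Set[str]

    def to_dict(self) -> Mapping[str, Any]:
        return {
            "columns": sorted(self.columns),
            "mapping_cols": sorted(self.mapping_cols),
        }


@dataclass(frozen=True)
class ClickhouseQueryProfileSketch:
    time_range: int
    table: str
    all_columns: Set[str]
    multi_level_condition: bool
    where_profile: FilterProfileSketch
    groupby_cols: Set[str]
    array_join_cols: Set[str]

    def to_dict(self) -> Mapping[str, Any]:
        return {
            "time_range": self.time_range,
            "table": self.table,
            "all_columns": sorted(self.all_columns),
            "multi_level_condition": self.multi_level_condition,
            "where_profile": self.where_profile.to_dict(),
            "groupby_cols": sorted(self.groupby_cols),
            "array_join_cols": sorted(self.array_join_cols),
        }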
def generate_profile(
    query: Union[Query, CompositeQuery[Table]],
) -> ClickhouseQueryProfile:
    """
    Takes a Physical query in, analyzes it and produces the
    ClickhouseQueryProfile data structure.
    """
    collector = TablesCollector()
    collector.visit(query)
    all_condition = collector.get_all_conditions()

    try:
        return ClickhouseQueryProfile(
            time_range=collector.get_max_time_range(),
            table=",".join(sorted([t for t in collector.get_tables()])),
            all_columns=_get_all_columns(collector.get_all_raw_columns()),
            multi_level_condition=collector.has_complex_condition(),
            where_profile=FilterProfile(
                columns=_list_columns_in_condition(all_condition),
                mapping_cols=_list_mappings(all_condition),
            ),
            groupby_cols=_list_columns(collector.get_all_groupby()),
            array_join_cols=_list_columns(collector.get_all_arrayjoin()),
        )
    except Exception:
        # Should never happen, but it is not worth failing queries while
        # rolling this out because we cannot build the profile.
        logger.warning("Failed to build query profile", exc_info=True)
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=FilterProfile(
                columns=set(),
                mapping_cols=set(),
            ),
            groupby_cols=set(),
            array_join_cols=set(),
        )
def generate_profile(query: Query) -> ClickhouseQueryProfile:
    """
    Takes a Physical query in, analyzes it and produces the
    ClickhouseQueryProfile data structure.
    """
    where = query.get_condition_from_ast()
    groupby = query.get_groupby_from_ast()

    try:
        return ClickhouseQueryProfile(
            time_range=_get_date_range(query),
            table=_get_table(query),
            all_columns=_get_all_columns(query),
            multi_level_condition=_has_complex_conditions(query),
            where_profile=FilterProfile(
                columns=_list_columns(where) if where is not None else set(),
                mapping_cols=_list_mapping(where) if where is not None else set(),
            ),
            groupby_cols=_list_groupby_columns(groupby)
            if groupby is not None
            else set(),
            array_join_cols=_list_array_join(query),
        )
    except Exception:
        # Should never happen, but it is not worth failing queries while
        # rolling this out because we cannot build the profile.
        logger.warning("Failed to build query profile", exc_info=True)
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=FilterProfile(
                columns=set(),
                mapping_cols=set(),
            ),
            groupby_cols=set(),
            array_join_cols=set(),
        )
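# Hedged refactoring sketch (hypothetical helper, not present in the module
# above): both versions of generate_profile build the same "empty" fallback
# profile in their except branch; a small factory would keep that duplication
# in one place while preserving the defensive behavior.
def _empty_profile() -> ClickhouseQueryProfile:
    # Sentinel values signal "profile could not be built" without failing the query.
    return ClickhouseQueryProfile(
        time_range=-1,
        table="",
        all_columns=set(),
        multi_level_condition=False,
        where_profile=FilterProfile(columns=set(), mapping_cols=set()),
        groupby_cols=set(),
        array_join_cols=set(),
    )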
                ],
            )
        ),
        ClickhouseQueryProfile(
            time_range=31,
            table="events",
            all_columns={
                "timestamp",
                "column2",
                "column3",
                "contexts.key",
                "tags.key",
                "tags.value",
            },
            multi_level_condition=False,
            where_profile=FilterProfile(
                columns={"timestamp", "tags.key", "tags.value"},
                mapping_cols={"tags.key", "tags.value"},
            ),
            groupby_cols={"column2", "column3"},
            array_join_cols={"contexts.key"},
        ),
        id="Query with timestamp, tags, and arrayjoin",
    ),
    pytest.param(
        ClickhouseQuery(
            Query(
                {},
                TableSource("events", ColumnSet([])),
                selected_columns=[
                    SelectedExpression("column2", Column("column2", None, "column2")),
                ],
def test_simple() -> None:
    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    query = Query(
        Entity(EntityKey.EVENTS, get_entity(EntityKey.EVENTS).get_data_model())
    )

    request = Request(
        id=uuid.UUID("a" * 32).hex,
        original_body=request_body,
        query=query,
        snql_anonymized="",
        query_settings=HTTPQuerySettings(referrer="search"),
        attribution_info=AttributionInfo(
            get_app_id("default"), "search", None, None, None
        ),
    )

    time = TestingClock()
    timer = Timer("test", clock=time)
    time.sleep(0.01)

    message = SnubaQueryMetadata(
        request=request,
        start_timestamp=datetime.utcnow() - timedelta(days=3),
        end_timestamp=datetime.utcnow(),
        dataset="events",
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql="select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100",
                sql_anonymized="select event_id from sentry_dist sample 0.1 prewhere project_id in ($I) limit 50, 100",
                start_timestamp=datetime.utcnow() - timedelta(days=3),
                end_timestamp=datetime.utcnow(),
                stats={"sample": 10, "error_code": 386},
                status=QueryStatus.SUCCESS,
                profile=ClickhouseQueryProfile(
                    time_range=10,
                    table="events",
                    all_columns={"timestamp", "tags"},
                    multi_level_condition=False,
                    where_profile=FilterProfile(
                        columns={"timestamp"},
                        mapping_cols={"tags"},
                    ),
                    groupby_cols=set(),
                    array_join_cols=set(),
                ),
                trace_id="b" * 32,
            )
        ],
        projects={2},
        snql_anonymized=request.snql_anonymized,
        entity=EntityKey.EVENTS.value,
    ).to_dict()

    processor = (
        get_writable_storage(StorageKey.QUERYLOG)
        .get_table_writer()
        .get_stream_loader()
        .get_processor()
    )

    assert processor.process_message(
        message, KafkaMessageMetadata(0, 0, datetime.now())
    ) == InsertBatch(
        [
            {
                "request_id": str(uuid.UUID("a" * 32)),
                "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
                "referrer": "search",
                "dataset": "events",
                "projects": [2],
                "organization": None,
                "timestamp": timer.for_json()["timestamp"],
                "duration_ms": 10,
                "status": "success",
                "clickhouse_queries.sql": [
                    "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
                ],
                "clickhouse_queries.status": ["success"],
                "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
                "clickhouse_queries.duration_ms": [0],
                "clickhouse_queries.stats": ['{"error_code": 386, "sample": 10}'],
                "clickhouse_queries.final": [0],
                "clickhouse_queries.cache_hit": [0],
                "clickhouse_queries.sample": [10.0],
                "clickhouse_queries.max_threads": [0],
                "clickhouse_queries.num_days": [10],
                "clickhouse_queries.clickhouse_table": [""],
                "clickhouse_queries.query_id": [""],
                "clickhouse_queries.is_duplicate": [0],
                "clickhouse_queries.consistent": [0],
                "clickhouse_queries.all_columns": [["tags", "timestamp"]],
                "clickhouse_queries.or_conditions": [False],
                "clickhouse_queries.where_columns": [["timestamp"]],
                "clickhouse_queries.where_mapping_columns": [["tags"]],
                "clickhouse_queries.groupby_columns": [[]],
                "clickhouse_queries.array_join_columns": [[]],
            }
        ],
        None,
    )
def test_simple() -> None:
    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    query = Query(get_storage(StorageKey.EVENTS).get_schema().get_data_source())

    request = Request(
        uuid.UUID("a" * 32).hex,
        request_body,
        query,
        HTTPRequestSettings(),
        "search",
    )

    time = TestingClock()
    timer = Timer("test", clock=time)
    time.sleep(0.01)

    message = SnubaQueryMetadata(
        request=request,
        dataset="events",
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql="select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100",
                stats={"sample": 10},
                status=QueryStatus.SUCCESS,
                profile=ClickhouseQueryProfile(
                    time_range=10,
                    table="events",
                    all_columns={"timestamp", "tags"},
                    multi_level_condition=False,
                    where_profile=FilterProfile(
                        columns={"timestamp"},
                        mapping_cols={"tags"},
                    ),
                    groupby_cols=set(),
                    array_join_cols=set(),
                ),
                trace_id="b" * 32,
            )
        ],
    ).to_dict()

    processor = (
        get_writable_storage(StorageKey.QUERYLOG)
        .get_table_writer()
        .get_stream_loader()
        .get_processor()
    )

    assert processor.process_message(
        message, KafkaMessageMetadata(0, 0, datetime.now())
    ) == InsertBatch(
        [
            {
                "request_id": str(uuid.UUID("a" * 32)),
                "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
                "referrer": "search",
                "dataset": "events",
                "projects": [1],
                "organization": None,
                "timestamp": timer.for_json()["timestamp"],
                "duration_ms": 10,
                "status": "success",
                "clickhouse_queries.sql": [
                    "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
                ],
                "clickhouse_queries.status": ["success"],
                "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
                "clickhouse_queries.duration_ms": [0],
                "clickhouse_queries.stats": ['{"sample": 10}'],
                "clickhouse_queries.final": [0],
                "clickhouse_queries.cache_hit": [0],
                "clickhouse_queries.sample": [10.0],
                "clickhouse_queries.max_threads": [0],
                "clickhouse_queries.num_days": [10],
                "clickhouse_queries.clickhouse_table": [""],
                "clickhouse_queries.query_id": [""],
                "clickhouse_queries.is_duplicate": [0],
                "clickhouse_queries.consistent": [0],
                "clickhouse_queries.all_columns": [["tags", "timestamp"]],
                "clickhouse_queries.or_conditions": [False],
                "clickhouse_queries.where_columns": [["timestamp"]],
                "clickhouse_queries.where_mapping_columns": [["tags"]],
                "clickhouse_queries.groupby_columns": [[]],
                "clickhouse_queries.array_join_columns": [[]],
            }
        ],
    )
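# Hedged sketch, illustrative only and not the actual QuerylogProcessor: both
# processor tests above expect each entry of query_list to be flattened into
# parallel "clickhouse_queries.*" arrays, with the profile's sets already
# serialized as sorted lists by to_dict(). A stripped-down version of that
# flattening for just the profile-derived fields (helper name is hypothetical):
from typing import Any, Mapping, Sequence


def flatten_profiles(profiles: Sequence[Mapping[str, Any]]) -> Mapping[str, Any]:
    # One list element per ClickHouse query; index i of every array describes
    # the same query, matching the columnar querylog schema asserted above.
    return {
        "clickhouse_queries.num_days": [p["time_range"] for p in profiles],
        "clickhouse_queries.all_columns": [p["all_columns"] for p in profiles],
        "clickhouse_queries.or_conditions": [
            p["multi_level_condition"] for p in profiles
        ],
        "clickhouse_queries.where_columns": [
            p["where_profile"]["columns"] for p in profiles
        ],
        "clickhouse_queries.where_mapping_columns": [
            p["where_profile"]["mapping_cols"] for p in profiles
        ],
        "clickhouse_queries.groupby_columns": [p["groupby_cols"] for p in profiles],
        "clickhouse_queries.array_join_columns": [
            p["array_join_cols"] for p in profiles
        ],
    }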
        ),
        ClickhouseQueryProfile(
            time_range=31,
            table="events",
            all_columns={
                "events.timestamp",
                "events.column2",
                "events.column3",
                "events.contexts.key",
                "events.tags.key",
                "events.tags.value",
            },
            multi_level_condition=False,
            where_profile=FilterProfile(
                columns={
                    "events.timestamp",
                    "events.tags.key",
                    "events.tags.value",
                },
                mapping_cols={"events.tags.key", "events.tags.value"},
            ),
            groupby_cols={"events.column2", "events.column3"},
            array_join_cols={"events.contexts.key"},
        ),
        id="Query with timestamp, tags, and arrayjoin",
    ),
    pytest.param(
        ClickhouseQuery(
            Table("events", ColumnSet([])),
            selected_columns=[
                SelectedExpression("column2", Column("column2", None, "column2")),
            ],
            condition=binary_condition(