def test_col_split_conditions(
    id_column: str,
    project_column: str,
    timestamp_column: str,
    query,
    expected_result,
) -> None:
    """The splitter must fire (return a result) exactly when the parsed
    query qualifies for a column split, per ``expected_result``."""
    dataset = get_dataset("events")
    parsed = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", parsed, HTTPRequestSettings(), {}, "r")
    entity = get_entity(parsed.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        # Canned single-row payload carrying the three split columns.
        row = {
            id_column: "asd123",
            project_column: 123,
            timestamp_column: "2019-10-01 22:33:42",
        }
        return QueryResult({"data": [row]}, {})

    outcome = splitter.execute(plan.query, HTTPRequestSettings(), do_query)
    assert (outcome is not None) == expected_result
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    """A query with nothing to split should reach the runner unchanged.

    Fix: the original runner asserted ``query == query`` — the inner
    parameter shadowed the outer query, so the assertion compared the
    received query to itself and was vacuously true. We now keep an alias
    to the outer query and assert against that.
    """
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity()
        .get_all_storages()[0]
        .get_schema()
        .get_data_source(),
    )
    # Alias the query we built: the runner's `query` parameter shadows it.
    expected_query = query

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # With no split performed, the strategy must hand us the original query.
        assert query == expected_query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    strategy.execute(query, HTTPRequestSettings(), do_query)
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Drive the full execution strategy, serving canned rows for each of
    the two sub-queries produced by the column split."""

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        cols = query.get_selected_columns()
        # The legacy and AST representations must agree on selected columns.
        ast_cols = [
            sel.expression.column_name
            for sel in query.get_selected_columns_from_ast() or []
            if isinstance(sel.expression, Column)
        ]
        assert cols == ast_cols
        # Dispatch the canned payload matching this sub-query's columns.
        if cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        if cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        raise ValueError(f"Unexpected selected columns: {cols}")

    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": list(second_query_data[0].keys()),
                "conditions": [""],
                "orderby": "events.event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
            selected_columns=[
                SelectedExpression(name=name, expression=Column(None, None, name))
                for name in second_query_data[0].keys()
            ],
        )
    )

    execution_strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    execution_strategy.execute(query, HTTPRequestSettings(), do_query)
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    """The splitter executes (returns non-None) only when the parsed query
    qualifies for a column split.

    Fix: the runner declared ``request_settings: RequestSettings = None`` —
    an implicit-Optional default forbidden by PEP 484. The splitter always
    supplies settings, so the default is simply dropped.
    """
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings
    ) -> QueryResult:
        # Canned single-row result carrying the three split columns.
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Exercise the split strategies end to end, answering each generated
    sub-query with the canned data set matching its selected columns."""

    def do_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        names = [
            sel.expression.column_name
            for sel in query.get_selected_columns() or []
            if isinstance(sel.expression, Column)
        ]
        # Serve whichever fixture matches this sub-query's column set.
        if names == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, {})
        if names == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, {})
        raise ValueError(f"Unexpected selected columns: {names}")

    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity()
        .get_all_storages()[0]
        .get_schema()
        .get_data_source(),
        selected_columns=[
            SelectedExpression(name=name, expression=Column(None, None, name))
            for name in second_query_data[0].keys()
        ],
    )

    execution_strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    execution_strategy.execute(query, HTTPQuerySettings(), do_query)
def test_set_limit_on_split_query() -> None:
    """After the first sub-query returns two rows, the follow-up query's
    limit must be capped to that row count (2), not the original 420.

    Fix: added the missing ``-> None`` return annotation for consistency
    with the other tests in this module.
    """
    storage = get_dataset("events").get_default_entity().get_all_storages()[0]
    query = ClickhouseQuery(
        Table("events", storage.get_schema().get_columns()),
        selected_columns=[
            SelectedExpression(col.name, Column(None, None, col.name))
            for col in storage.get_schema().get_columns()
        ],
        limit=420,
    )

    query_run_count = 0

    def do_query(query: ClickhouseQuery, query_settings: QuerySettings) -> QueryResult:
        nonlocal query_run_count
        query_run_count += 1
        if query_run_count == 1:
            # First sub-query: return two rows.
            # NOTE(review): the leading space in the timestamp strings is
            # preserved from the original fixture — confirm it is intentional.
            return QueryResult(
                result={
                    "data": [
                        {
                            "event_id": "a",
                            "project_id": "1",
                            "timestamp": " 2019-10-01 22:33:42",
                        },
                        {
                            "event_id": "a",
                            "project_id": "1",
                            "timestamp": " 2019-10-01 22:44:42",
                        },
                    ]
                },
                extra={},
            )
        else:
            # Second sub-query: its limit must equal the first result's row count.
            assert query.get_limit() == 2
            return QueryResult({}, {})

    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ).execute(query, HTTPQuerySettings(), do_query)
    assert query_run_count == 2
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    """A plain query should pass through the strategy without being split.

    Fix: the original runner asserted ``query == query`` — the inner
    parameter shadowed the outer query, making the check vacuously true.
    We alias the outer query and assert against the alias instead.
    """
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        LogicalQuery(
            {
                "selected_columns": ["event_id"],
                "conditions": [""],
                "orderby": "event_id",
                "sample": 10,
                "limit": 100,
                "offset": 50,
            },
            events.get_all_storages()[0].get_schema().get_data_source(),
        )
    )
    # Alias the query we built: the runner's `query` parameter shadows it.
    expected_query = query

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        # With no split performed, the strategy must hand us the original query.
        assert query == expected_query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )
    strategy.execute(query, HTTPRequestSettings(), do_query)
"sentry:dist": "dist", "sentry:user": "******", }, "contexts": {"trace.trace_id": "trace_id"}, } ), MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), ArrayJoinKeyValueOptimizer("tags"), UUIDColumnProcessor(set(["event_id", "trace_id"])), EventsBooleanContextsProcessor(), PrewhereProcessor( [ "event_id", "release", "message", "transaction_name", "environment", "project_id", ] ), ], query_splitters=[ ColumnSplitQueryStrategy( id_column="event_id", project_column="project_id", timestamp_column="timestamp", ), TimeSplitQueryStrategy(timestamp_col="timestamp"), ], )