def test_get_time_range() -> None: """ Test finding the time range of a query. """ body = { "selected_columns": ["event_id"], "conditions": [ ("timestamp", ">=", "2019-09-18T10:00:00"), ("timestamp", ">=", "2000-09-18T10:00:00"), ("timestamp", "<", "2019-09-19T12:00:00"), [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])], ("project_id", "IN", [1]), ], } events = get_dataset("events") query = parse_query(body, events) processors = events.get_query_processors() for processor in processors: if isinstance(processor, TimeSeriesProcessor): processor.process_query(query, HTTPRequestSettings()) from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query), "timestamp") assert (from_date_ast is not None and isinstance(from_date_ast, datetime) and from_date_ast.isoformat() == "2019-09-18T10:00:00") assert (to_date_ast is not None and isinstance(to_date_ast, datetime) and to_date_ast.isoformat() == "2019-09-19T12:00:00")
def query() -> ClickhouseQuery: return ClickhouseQuery( LogicalQuery( {"conditions": [("project_id", "IN", [2])]}, TableSource("my_table", ColumnSet([])), condition=build_in("project_id", [2]), ))
def test_no_split(dataset_name: str, id_column: str, project_column: str, timestamp_column: str) -> None: events = get_dataset(dataset_name) query = ClickhouseQuery( events.get_default_entity().get_all_storages() [0].get_schema().get_data_source(), ) def do_query( query: ClickhouseQuery, request_settings: RequestSettings, reader: Reader, ) -> QueryResult: assert query == query return QueryResult({}, {}) strategy = SimpleQueryPlanExecutionStrategy( ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True), [], [ ColumnSplitQueryStrategy( id_column=id_column, project_column=project_column, timestamp_column=timestamp_column, ), TimeSplitQueryStrategy(timestamp_col=timestamp_column), ], ) strategy.execute(query, HTTPRequestSettings(), do_query)
def query_with_timestamp() -> ClickhouseQuery: return ClickhouseQuery( Table("my_table", ColumnSet([])), condition=build_and( build_in("project_id", [2]), build_time_range(datetime(2021, 1, 1), datetime(2021, 1, 2)), ), )
def query_with_future_timestamp() -> ClickhouseQuery: return ClickhouseQuery( Table("my_table", ColumnSet([])), condition=build_and( build_in("project_id", [2]), build_time_range(datetime.now() + timedelta(days=1), datetime.now() + timedelta(days=2)), ), )
def test_col_split( dataset_name: str, id_column: str, project_column: str, timestamp_column: str, first_query_data: Sequence[MutableMapping[str, Any]], second_query_data: Sequence[MutableMapping[str, Any]], ) -> None: def do_query( query: ClickhouseQuery, request_settings: RequestSettings, reader: Reader[SqlQuery], ) -> QueryResult: selected_cols = query.get_selected_columns() assert selected_cols == [ c.expression.column_name for c in query.get_selected_columns_from_ast() or [] if isinstance(c.expression, Column) ] if selected_cols == list(first_query_data[0].keys()): return QueryResult({"data": first_query_data}, {}) elif selected_cols == list(second_query_data[0].keys()): return QueryResult({"data": second_query_data}, {}) else: raise ValueError(f"Unexpected selected columns: {selected_cols}") events = get_dataset(dataset_name) query = ClickhouseQuery( LogicalQuery( { "selected_columns": list(second_query_data[0].keys()), "conditions": [""], "orderby": "events.event_id", "sample": 10, "limit": 100, "offset": 50, }, events.get_all_storages()[0].get_schema().get_data_source(), selected_columns=[ SelectedExpression(name=col_name, expression=Column(None, None, col_name)) for col_name in second_query_data[0].keys() ], )) strategy = SimpleQueryPlanExecutionStrategy( ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True), [], [ ColumnSplitQueryStrategy(id_column, project_column, timestamp_column), TimeSplitQueryStrategy(timestamp_col=timestamp_column), ], ) strategy.execute(query, HTTPRequestSettings(), do_query)
def test_when_sample_is_not_provided_without_turbo(self): query = Query({ "conditions": [], "aggregations": [], "groupby": [], }) request_settings = RequestSettings(turbo=False, consistent=False, debug=False) clickhouse_query = ClickhouseQuery( dataset=self.dataset, query=query, settings=request_settings, prewhere_conditions=[], ) assert 'SAMPLE' not in clickhouse_query.format_sql()
def test_provided_sample_should_be_used_with_turbo(self): query = Query({ "conditions": [], "aggregations": [], "groupby": [], "sample": 0.1 }) request_settings = RequestSettings(turbo=True, consistent=False, debug=False) clickhouse_query = ClickhouseQuery( dataset=self.dataset, query=query, settings=request_settings, prewhere_conditions=[], ) assert 'SAMPLE 0.1' in clickhouse_query.format_sql()
def test_format_clickhouse_specific_query() -> None: """ Adds a few of the Clickhosue specific fields to the query. """ query = ClickhouseQuery( TableSource("my_table", ColumnSet([])), selected_columns=[ SelectedExpression("column1", Column(None, None, "column1")), SelectedExpression("column2", Column(None, "table1", "column2")), ], condition=binary_condition( None, "eq", lhs=Column(None, None, "column1"), rhs=Literal(None, "blabla"), ), groupby=[ Column(None, None, "column1"), Column(None, "table1", "column2") ], having=binary_condition( None, "eq", lhs=Column(None, None, "column1"), rhs=Literal(None, 123), ), order_by=[ OrderBy(OrderByDirection.ASC, Column(None, None, "column1")) ], array_join=Column(None, None, "column1"), sample=0.1, totals=True, limitby=(10, "environment"), ) query.set_final(True) query.set_offset(50) query.set_limit(100) request_settings = HTTPRequestSettings() clickhouse_query = AstSqlQuery(query, request_settings) expected = { "from": "FROM my_table FINAL SAMPLE 0.1", "group": "GROUP BY (column1, table1.column2) WITH TOTALS", "having": "HAVING eq(column1, 123)", "array_join": "ARRAY JOIN column1", "limit": "LIMIT 100 OFFSET 50", "limitby": "LIMIT 10 BY environment", "order": "ORDER BY column1 ASC", "select": "SELECT column1, table1.column2", "where": "WHERE eq(column1, 'blabla')", } assert clickhouse_query.sql_data() == expected
def parse_and_run_query(dataset, request: Request, timer) -> QueryResult: from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit( request.extensions['timeseries']) extensions = dataset.get_extensions() for name, extension in extensions.items(): extension.get_processor().process_query(request.query, request.extensions[name], request.settings) request.query.add_conditions(dataset.default_conditions()) if request.settings.get_turbo(): request.query.set_final(False) prewhere_conditions = [] # Add any condition to PREWHERE if: # - It is a single top-level condition (not OR-nested), and # - Any of its referenced columns are in dataset.get_prewhere_keys() prewhere_candidates = [(util.columns_in_expr(cond[0]), cond) for cond in request.query.get_conditions() if util.is_condition(cond) and any( col in dataset.get_prewhere_keys() for col in util.columns_in_expr(cond[0]))] # Use the condition that has the highest priority (based on the # position of its columns in the prewhere keys list) prewhere_candidates = sorted( [(min(dataset.get_prewhere_keys().index(col) for col in cols if col in dataset.get_prewhere_keys()), cond) for cols, cond in prewhere_candidates], key=lambda priority_and_col: priority_and_col[0]) if prewhere_candidates: prewhere_conditions = [cond for _, cond in prewhere_candidates ][:settings.MAX_PREWHERE_CONDITIONS] request.query.set_conditions( list( filter(lambda cond: cond not in prewhere_conditions, request.query.get_conditions()))) source = dataset.get_dataset_schemas().get_read_schema().get_data_source() # TODO: consider moving the performance logic and the pre_where generation into # ClickhouseQuery since they are Clickhouse specific query = ClickhouseQuery(dataset, request.query, request.settings, prewhere_conditions) timer.mark('prepare_query') stats = { 'clickhouse_table': source, 'final': request.query.get_final(), 'referrer': http_request.referrer, 'num_days': (to_date - from_date).days, 'sample': request.query.get_sample(), } return raw_query(request, query, clickhouse_ro, timer, stats)
def test_find_projects(query_body: MutableMapping[str, Any], expected_projects: Set[int]) -> None: events = get_dataset("events") query = parse_query(query_body, events) query = ClickhouseQuery(query) project_ids = get_project_ids_in_query(query, "project_id") assert project_ids == expected_projects project_ids_ast = get_project_ids_in_query_ast(query, "project_id") assert project_ids_ast == expected_projects
def test_format_clickhouse_specific_query() -> None: """ Adds a few of the Clickhosue specific fields to the query. """ query = ClickhouseQuery( Table("my_table", ColumnSet([]), final=True, sampling_rate=0.1), selected_columns=[ SelectedExpression("column1", Column(None, None, "column1")), SelectedExpression("column2", Column(None, "table1", "column2")), ], condition=binary_condition( "eq", lhs=Column(None, None, "column1"), rhs=Literal(None, "blabla"), ), groupby=[ Column(None, None, "column1"), Column(None, "table1", "column2") ], having=binary_condition( "eq", lhs=Column(None, None, "column1"), rhs=Literal(None, 123), ), order_by=[ OrderBy(OrderByDirection.ASC, Column(None, None, "column1")) ], array_join=Column(None, None, "column1"), totals=True, limitby=LimitBy(10, Column(None, None, "environment")), ) query.set_offset(50) query.set_limit(100) request_settings = HTTPRequestSettings() clickhouse_query = format_query(query, request_settings) expected = [ "SELECT column1, table1.column2", ["FROM", "my_table FINAL SAMPLE 0.1"], "ARRAY JOIN column1", "WHERE eq(column1, 'blabla')", "GROUP BY column1, table1.column2 WITH TOTALS", "HAVING eq(column1, 123)", "ORDER BY column1 ASC", "LIMIT 10 BY environment", "LIMIT 100 OFFSET 50", ] assert clickhouse_query.structured() == expected
def test_time_split_ast() -> None: """ Test that the time split transforms the query properly both on the old representation and on the AST representation. """ found_timestamps = [] def do_query( query: ClickhouseQuery, request_settings: RequestSettings, ) -> QueryResult: from_date_ast, to_date_ast = get_time_range(query, "timestamp") assert from_date_ast is not None and isinstance(from_date_ast, datetime) assert to_date_ast is not None and isinstance(to_date_ast, datetime) found_timestamps.append((from_date_ast.isoformat(), to_date_ast.isoformat())) return QueryResult({"data": []}, {}) body = { "selected_columns": [ "event_id", "level", "logger", "server_name", "transaction", "timestamp", "project_id", ], "conditions": [ ("timestamp", ">=", "2019-09-18T10:00:00"), ("timestamp", "<", "2019-09-19T12:00:00"), ("project_id", "IN", [1]), ], "limit": 10, "orderby": ["-timestamp"], } events = get_dataset("events") query = parse_query(body, events) settings = HTTPRequestSettings() for p in events.get_entity(None).get_query_processors(): p.process_query(query, settings) splitter = TimeSplitQueryStrategy("timestamp") splitter.execute(ClickhouseQuery(query), settings, do_query) assert found_timestamps == [ ("2019-09-19T11:00:00", "2019-09-19T12:00:00"), ("2019-09-19T01:00:00", "2019-09-19T11:00:00"), ("2019-09-18T10:00:00", "2019-09-19T01:00:00"), ]
def build_query( selected_columns: Optional[Sequence[Expression]] = None, condition: Optional[Expression] = None, having: Optional[Expression] = None, ) -> ClickhouseQuery: return ClickhouseQuery( Table("test", ColumnSet([])), selected_columns=[ SelectedExpression(name=s.alias, expression=s) for s in selected_columns or [] ], condition=condition, having=having, )
def test_when_sample_is_not_provided_with_turbo(self): source = self.dataset.get_dataset_schemas().get_read_schema( ).get_data_source() query = Query( { "conditions": [], "aggregations": [], "groupby": [], }, source, ) request_settings = RequestSettings(turbo=True, consistent=False, debug=False) clickhouse_query = ClickhouseQuery( dataset=self.dataset, query=query, settings=request_settings, prewhere_conditions=[], ) assert "SAMPLE 0.2" in clickhouse_query.format_sql()
def build_clickhouse_node( alias: str, from_clause: Table, selected_columns: Sequence[SelectedExpression], condition: Optional[Expression], ) -> IndividualNode[Table]: return IndividualNode( alias=alias, data_source=ClickhouseQuery( from_clause=from_clause, selected_columns=selected_columns, condition=condition, ), )
def test_col_split( dataset_name: str, id_column: str, project_column: str, timestamp_column: str, first_query_data: Sequence[MutableMapping[str, Any]], second_query_data: Sequence[MutableMapping[str, Any]], ) -> None: def do_query( query: ClickhouseQuery, query_settings: QuerySettings, reader: Reader, ) -> QueryResult: selected_col_names = [ c.expression.column_name for c in query.get_selected_columns() or [] if isinstance(c.expression, Column) ] if selected_col_names == list(first_query_data[0].keys()): return QueryResult({"data": first_query_data}, {}) elif selected_col_names == list(second_query_data[0].keys()): return QueryResult({"data": second_query_data}, {}) else: raise ValueError( f"Unexpected selected columns: {selected_col_names}") events = get_dataset(dataset_name) query = ClickhouseQuery( events.get_default_entity().get_all_storages() [0].get_schema().get_data_source(), selected_columns=[ SelectedExpression(name=col_name, expression=Column(None, None, col_name)) for col_name in second_query_data[0].keys() ], ) strategy = SimpleQueryPlanExecutionStrategy( ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True), [], [ ColumnSplitQueryStrategy(id_column, project_column, timestamp_column), TimeSplitQueryStrategy(timestamp_col=timestamp_column), ], ) strategy.execute(query, HTTPQuerySettings(), do_query)
def test_provided_sample_should_be_used(self): source = self.dataset.get_dataset_schemas().get_read_schema( ).get_data_source() query = Query( { "conditions": [], "aggregations": [], "groupby": [], "sample": 0.1 }, source, ) request_settings = RequestSettings(turbo=False, consistent=False, debug=False) clickhouse_query = ClickhouseQuery( dataset=self.dataset, query=query, settings=request_settings, prewhere_conditions=[], ) assert 'SAMPLE 0.1' in clickhouse_query.format_sql()
def build_query( selected_columns: Optional[Sequence[Expression]] = None, condition: Optional[Expression] = None, having: Optional[Expression] = None, ) -> ClickhouseQuery: return ClickhouseQuery( SnubaQuery( {}, None, selected_columns=[ SelectedExpression(name=s.alias, expression=s) for s in selected_columns or [] ], condition=condition, having=having, ))
def test_set_limit_on_split_query(): storage = get_dataset("events").get_default_entity().get_all_storages()[0] query = ClickhouseQuery( Table("events", storage.get_schema().get_columns()), selected_columns=[ SelectedExpression(col.name, Column(None, None, col.name)) for col in storage.get_schema().get_columns() ], limit=420, ) query_run_count = 0 def do_query(query: ClickhouseQuery, query_settings: QuerySettings) -> QueryResult: nonlocal query_run_count query_run_count += 1 if query_run_count == 1: return QueryResult( result={ "data": [ { "event_id": "a", "project_id": "1", "timestamp": " 2019-10-01 22:33:42", }, { "event_id": "a", "project_id": "1", "timestamp": " 2019-10-01 22:44:42", }, ] }, extra={}, ) else: assert query.get_limit() == 2 return QueryResult({}, {}) ColumnSplitQueryStrategy( id_column="event_id", project_column="project_id", timestamp_column="timestamp", ).execute(query, HTTPQuerySettings(), do_query) assert query_run_count == 2
def identity_translate(query: LogicalQuery) -> ClickhouseQuery: """ Utility method to build a Clickhouse Query from a Logical Query without transforming anything. It is exposed by this module because it is often useful in tests. """ return ClickhouseQuery( from_clause=None, selected_columns=query.get_selected_columns(), array_join=query.get_arrayjoin(), condition=query.get_condition(), groupby=query.get_groupby(), having=query.get_having(), order_by=query.get_orderby(), limitby=query.get_limitby(), limit=query.get_limit(), offset=query.get_offset(), totals=query.has_totals(), granularity=query.get_granularity(), )
def test_no_split( dataset_name: str, id_column: str, project_column: str, timestamp_column: str ) -> None: events = get_dataset(dataset_name) query = ClickhouseQuery( LogicalQuery( { "selected_columns": ["event_id"], "conditions": [""], "orderby": "event_id", "sample": 10, "limit": 100, "offset": 50, }, events.get_all_storages()[0].get_schema().get_data_source(), ) ) def do_query( query: ClickhouseQuery, request_settings: RequestSettings, reader: Reader[SqlQuery], ) -> QueryResult: assert query == query return QueryResult({}, {}) strategy = SimpleQueryPlanExecutionStrategy( ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True), [], [ ColumnSplitQueryStrategy( id_column=id_column, project_column=project_column, timestamp_column=timestamp_column, ), TimeSplitQueryStrategy(timestamp_col=timestamp_column), ], ) strategy.execute(query, HTTPRequestSettings(), do_query)
def execute( self, query: ClickhouseQuery, # TODO: move Clickhouse specific arguments into DictClickhouseQuery settings: Optional[Mapping[str, str]] = None, query_id: Optional[str] = None, with_totals: bool = False, ) -> Result: if settings is None: settings = {} kwargs = {} if query_id is not None: kwargs["query_id"] = query_id sql = query.format_sql() return self.__transform_result( self.__client.execute( sql, with_column_types=True, settings=settings, **kwargs ), with_totals=with_totals, )
test_cases = [ ( "not promoted", ClickhouseQuery( Table("events", columns), selected_columns=[ SelectedExpression( "tags[foo]", FunctionCall( "tags[foo]", "arrayValue", ( Column(None, None, "tags.value"), FunctionCall( None, "indexOf", ( Column(None, None, "tags.key"), Literal(None, "foo"), ), ), ), ), ) ], ), ClickhouseQuery( Table("events", columns), selected_columns=[ SelectedExpression(
def query() -> ClickhouseQuery: return ClickhouseQuery( Table("my_table", ColumnSet([])), condition=build_in("project_id", [2]), )
from_clause=ClickhouseQuery( from_clause=events_table, selected_columns=[ SelectedExpression("project_id", Column(None, None, "project_id")), SelectedExpression( "count_environment", FunctionCall( "count_environment", function_name="ifNull", parameters=( FunctionCall( None, "uniq", (build_mapping_expr( None, None, "tags", Literal(None, "environment"), ), ), ), Literal(alias=None, value=0), ), ), ), ], groupby=[Column(None, None, "project_id")], condition=binary_condition( BooleanFunctions.AND, binary_condition( ConditionFunctions.EQ, Column(None, None, "project_id"), Literal(None, 1), ), binary_condition( ConditionFunctions.GTE, Column(None, None, "timestamp"), Literal(None, datetime(2020, 1, 1, 12, 0)), ), ), ),
ClickhouseQuery( Query( {}, TableSource("events", ColumnSet([])), selected_columns=[ SelectedExpression("column2", Column("column2", None, "column2")), SelectedExpression( "something", FunctionCall( "something", "arrayJoin", (Column(None, None, "contexts.key"), ), ), ), ], condition=binary_condition( None, BooleanFunctions.AND, binary_condition( None, ConditionFunctions.GTE, Column(None, None, "timestamp"), Literal(None, datetime(2020, 8, 1)), ), binary_condition( None, BooleanFunctions.AND, binary_condition( None, ConditionFunctions.LT, Column(None, None, "timestamp"), Literal(None, datetime(2020, 9, 1)), ), binary_condition( None, ConditionFunctions.EQ, build_mapping_expr( "tags[asd]", None, "tags", Literal(None, "asd"), ), Literal(None, "sdf"), ), ), ), groupby=[ Column("column2", None, "column2"), Column("column3", None, "column3"), ], )),
expression=SubscriptableReference( None, Column(None, None, "tags"), Literal(None, "myTag") ), ), ], ), ClickhouseQuery( from_clause=Table("my_table", ColumnSet([])), selected_columns=[ SelectedExpression("alias", Column("alias", "table", "column")), SelectedExpression( "alias2", FunctionCall( "alias2", "f1", (Column(None, None, "column2"), Column(None, None, "column3"),), ), ), SelectedExpression( name=None, expression=SubscriptableReference( None, Column(None, None, "tags"), Literal(None, "myTag") ), ), ], ), id="default - no change", ), pytest.param( TranslationMappers( columns=[ColumnToColumn(None, "column2", None, "not_column2")], subscriptables=[SubscriptableMapper(None, "tags", None, "tags")],
def query_with_multiple_group_ids() -> ClickhouseQuery: return ClickhouseQuery( Table("my_table", ColumnSet([])), condition=build_group_ids_condition(), )
from_clause=ClickhouseQuery( from_clause=events_table, selected_columns=[ SelectedExpression("project_id", Column(None, None, "project_id")), SelectedExpression( "count_environment", FunctionCall( "count_environment", function_name="ifNull", parameters=( FunctionCall( None, "uniq", (build_mapping_expr( None, None, "tags", Literal(None, "environment"), ), ), ), Literal(alias=None, value=0), ), ), ), ], groupby=[Column(None, None, "project_id")], condition=binary_condition( ConditionFunctions.EQ, Column(None, None, "project_id"), Literal(None, 1), ), ),