def col_split(dataset, request: Request, column_split_spec: ColumnSplitSpec, *args, **kwargs):
    """
    Split query in 2 steps if a large number of columns is being selected.
    - First query only selects event_id and project_id.
    - Second query selects all fields for only those events.
    - Shrink the date range.
    """
    # The query function may mutate the request body during query
    # evaluation, so we need to copy the body to ensure that the query has
    # not been modified by the time we're ready to run the full query.
    minimal_request = copy.deepcopy(request)
    minimal_request.query.set_selected_columns(column_split_spec.get_min_columns())
    result = query_func(dataset, minimal_request, *args, **kwargs)
    del minimal_request

    if result.result["data"]:
        request = copy.deepcopy(request)

        event_ids = list(
            set(
                [
                    event[column_split_spec.id_column]
                    for event in result.result["data"]
                ]
            )
        )
        request.query.add_conditions([(column_split_spec.id_column, "IN", event_ids)])
        request.query.set_offset(0)
        request.query.set_limit(len(event_ids))

        project_ids = list(
            set(
                [
                    event[column_split_spec.project_column]
                    for event in result.result["data"]
                ]
            )
        )
        request.extensions["project"]["project"] = project_ids

        timestamp_field = column_split_spec.timestamp_column
        timestamps = [event[timestamp_field] for event in result.result["data"]]
        request.extensions["timeseries"]["from_date"] = util.parse_datetime(
            min(timestamps)
        ).isoformat()
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        request.extensions["timeseries"]["to_date"] = (
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1)
        ).isoformat()

    return query_func(dataset, request, *args, **kwargs)

def validate(self, value) -> Request:
    value = validate_jsonschema(value, self.__composite_schema)

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema['properties'].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema['properties'].keys()
        if key in value
    }

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema['properties'].keys()
            if key in value
        }

    return Request(
        Query(query_body),
        RequestSettings(settings['turbo'], settings['consistent'], settings['debug']),
        extensions,
    )

def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)
        ),
    )
    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)
        ),
    )
    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline,
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request(
                "",
                query_body,
                query,
                request_settings,
                "ref",
            ),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY),
        ],
    )

def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result

def validate(self, value, dataset: Dataset, referrer: str) -> Request:
    try:
        value = validate_jsonschema(value, self.__composite_schema)
    except jsonschema.ValidationError as error:
        raise JsonSchemaValidationException(str(error)) from error

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    query = parse_query(query_body, dataset)
    request_id = uuid.uuid4().hex
    return Request(
        request_id, query, self.__setting_class(**settings), extensions, referrer
    )

def validate(self, value, dataset: Dataset, referrer: str) -> Request:
    value = validate_jsonschema(value, self.__composite_schema)

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    query = parse_query(query_body, dataset)

    return Request(query, self.__setting_class(**settings), extensions, referrer)

def test_select_storage(
    query_body: MutableMapping[str, Any], is_subscription: bool, expected_table: str
) -> None:
    sessions = get_dataset("sessions")
    snql_query = json_to_snql(query_body, "sessions")
    query, snql_anonymized = parse_snql_query(str(snql_query), sessions)
    query_body = json.loads(snql_query.snuba())
    subscription_settings = (
        SubscriptionQuerySettings if is_subscription else HTTPQuerySettings
    )

    request = Request(
        id="a",
        original_body=query_body,
        query=query,
        snql_anonymized=snql_anonymized,
        query_settings=subscription_settings(referrer=""),
        attribution_info=AttributionInfo(
            get_app_id("default"), "blah", None, None, None
        ),
    )

    def query_runner(
        query: Query, settings: QuerySettings, reader: Reader
    ) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source(),
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(
        query,
        RequestSettings(False, False, False),
        {},
    )

    do_query(events, request, None)

def test_alias_validation(
    query_body: MutableMapping[str, Any], expected_result: bool
) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query_plan = events.get_query_plan_builder().build_plan(
        Request("", query, HTTPRequestSettings(), {}, "")
    )

    assert query_plan.query.validate_aliases() == expected_result

def test_events_processing() -> None: query_body = { "query": """ MATCH (events) SELECT tags[transaction], contexts[browser.name] WHERE project_id = 1 AND timestamp >= toDateTime('2020-01-01 12:00:00') AND timestamp < toDateTime('2020-01-02 12:00:00') """, "dataset": "events", } events_dataset = get_dataset("events") events_entity = events_dataset.get_default_entity() query, snql_anonymized = parse_snql_query(query_body["query"], events_dataset) request = Request( id="", original_body=query_body, query=query, snql_anonymized=snql_anonymized, query_settings=HTTPQuerySettings(referrer=""), attribution_info=AttributionInfo(get_app_id("blah"), "blah", None, None, None), ) def query_runner(query: Query, settings: QuerySettings, reader: Reader) -> QueryResult: assert query.get_selected_columns() == [ SelectedExpression( "tags[transaction]", Column("_snuba_tags[transaction]", None, "transaction_name"), ), SelectedExpression( "contexts[browser.name]", FunctionCall( "_snuba_contexts[browser.name]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "browser.name"), ), ), ), ), ), ] return QueryResult({}, {}) events_entity.get_query_pipeline_builder().build_execution_pipeline( request, query_runner).execute()
def test_sessions_processing() -> None: query_body = { "selected_columns": ["duration_quantiles", "sessions", "users"], "conditions": [ ["org_id", "=", 1], ["project_id", "=", 1], ["started", ">", "2020-01-01 12:00:00"], ], } sessions = get_dataset("sessions") query = parse_query(query_body, sessions) request = Request("", query_body, query, HTTPRequestSettings(), "") def query_runner(query: Query, settings: RequestSettings, reader: Reader) -> QueryResult: quantiles = tuple( Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1]) assert query.get_selected_columns() == [ SelectedExpression( "duration_quantiles", CurriedFunctionCall( "_snuba_duration_quantiles", FunctionCall( None, "quantilesIfMerge", quantiles, ), (Column(None, None, "duration_quantiles"), ), ), ), SelectedExpression( "sessions", FunctionCall( "_snuba_sessions", "plus", ( FunctionCall(None, "countIfMerge", (Column(None, None, "sessions"), )), FunctionCall( None, "sumIfMerge", (Column(None, None, "sessions_preaggr"), ), ), ), ), ), SelectedExpression( "users", FunctionCall("_snuba_users", "uniqIfMerge", (Column(None, None, "users"), )), ), ] return QueryResult({}, {}) sessions.get_default_entity().get_query_pipeline_builder( ).build_execution_pipeline(request, query_runner).execute()
def col_split(dataset, request: Request, *args, **kwargs):
    """
    Split query in 2 steps if a large number of columns is being selected.
    - First query only selects event_id and project_id.
    - Second query selects all fields for only those events.
    - Shrink the date range.
    """
    # The query function may mutate the request body during query
    # evaluation, so we need to copy the body to ensure that the query has
    # not been modified by the time we're ready to run the full query.
    minimal_request = copy.deepcopy(request)
    minimal_request.query.set_selected_columns(MIN_COLS)
    result, status = query_func(dataset, minimal_request, *args, **kwargs)
    del minimal_request

    # If something failed, just return
    if status != 200:
        return result, status

    if result['data']:
        request = copy.deepcopy(request)

        event_ids = list(set([event['event_id'] for event in result['data']]))
        request.query.add_conditions([('event_id', 'IN', event_ids)])
        request.query.set_offset(0)
        request.query.set_limit(len(event_ids))

        project_ids = list(set([event['project_id'] for event in result['data']]))
        request.extensions['project']['project'] = project_ids

        timestamps = [event['timestamp'] for event in result['data']]
        request.extensions['timeseries']['from_date'] = util.parse_datetime(
            min(timestamps)
        ).isoformat()
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        request.extensions['timeseries']['to_date'] = (
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1)
        ).isoformat()

    return query_func(dataset, request, *args, **kwargs)

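Both `col_split` variants above read `query_func` from an enclosing scope rather than taking it as a parameter, and the split tests later in this section wrap their query functions with a `@split_query` decorator. A minimal sketch of how those pieces presumably fit together follows; the exact wiring is an assumption for illustration, not taken from this code.

```python
from functools import wraps


def split_query(query_func):
    # Hypothetical structure (an assumption): nesting ``col_split`` in this scope
    # is what lets it call ``query_func`` without receiving it as a parameter, as
    # seen in the two ``col_split`` snippets above.
    def col_split(dataset, request, *args, **kwargs):
        # ...body as shown above: run a minimal query via query_func, narrow the
        # request to the returned event ids and date range, then re-run query_func.
        return query_func(dataset, request, *args, **kwargs)

    @wraps(query_func)
    def wrapper(dataset, request, *args, **kwargs):
        # A real implementation would decide here whether splitting applies at all
        # (e.g. based on how many columns are selected) before delegating.
        return col_split(dataset, request, *args, **kwargs)

    return wrapper
```
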
def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()
    request = Request("a", query, request_settings, {}, "r")
    _ = dataset.get_query_plan_builder().build_plan(request)

    assert (
        DictClickhouseQuery(dataset, query, request_settings).format_sql()
        == expected_query
    )

def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery: dataset = get_dataset("transactions") query = parse_query(query_body, dataset) request = Request("a", query, HTTPRequestSettings(), {}, "r") for p in dataset.get_query_processors(): p.process_query(query, request.settings) plan = dataset.get_query_plan_builder().build_plan(request) ArrayJoinKeyValueOptimizer("tags").process_query(plan.query, request.settings) return plan.query
def validate(
    self, value: MutableMapping[str, Any], dataset: Dataset, referrer: str
) -> Request:
    try:
        value = validate_jsonschema(value, self.__composite_schema)
    except jsonschema.ValidationError as error:
        raise JsonSchemaValidationException(str(error)) from error

    query_body = {
        key: value.pop(key)
        for key in self.__query_schema["properties"].keys()
        if key in value
    }
    settings = {
        key: value.pop(key)
        for key in self.__settings_schema["properties"].keys()
        if key in value
    }

    class_name = self.__setting_class
    if isinstance(class_name, type(HTTPRequestSettings)):
        settings_obj: Union[
            HTTPRequestSettings, SubscriptionRequestSettings
        ] = class_name(**settings)
    elif isinstance(class_name, type(SubscriptionRequestSettings)):
        settings_obj = class_name()

    extensions = {}
    for extension_name, extension_schema in self.__extension_schemas.items():
        extensions[extension_name] = {
            key: value.pop(key)
            for key in extension_schema["properties"].keys()
            if key in value
        }

    if self.__language == Language.SNQL:
        query = parse_snql_query(query_body["query"], dataset)
    else:
        query = parse_query(query_body, dataset)
        apply_query_extensions(query, extensions, settings_obj)

    request_id = uuid.uuid4().hex
    return Request(
        request_id,
        # TODO: Replace this with the actual query raw body.
        # this can have an impact on subscriptions so we need
        # to be careful with the change.
        ChainMap(query_body, *extensions.values()),
        query,
        settings_obj,
        referrer,
    )

def test_select_storage(
    query_body: MutableMapping[str, Any], expected_table: str
) -> None:
    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPRequestSettings], Type[SubscriptionRequestSettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPRequestSettings:
                settings = {
                    **request_parts.settings,
                    "consistent": _consistent_override(
                        request_parts.settings.get("consistent", False), referrer
                    ),
                }
                settings_obj: Union[
                    HTTPRequestSettings, SubscriptionRequestSettings
                ] = settings_class(**settings)
            elif settings_class == SubscriptionRequestSettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer)
                )

            query = parser(request_parts, settings_obj, dataset)

            request_id = uuid.uuid4().hex
            request = Request(
                request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                ChainMap(request_parts.query, *request_parts.extensions.values()),
                query,
                settings_obj,
                referrer,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request

def test_events_processing() -> None: query_body = { "selected_columns": ["tags[transaction]", "contexts[browser.name]"] } events_dataset = get_dataset("events") events_entity = events_dataset.get_default_entity() events_storage = events_entity.get_writable_storage() query = parse_query(query_body, events_dataset) request = Request("", query_body, query, HTTPRequestSettings(), "") def query_runner(query: Query, settings: RequestSettings, reader: Reader) -> QueryResult: if events_storage.get_storage_key() == StorageKey.EVENTS: transaction_col_name = "transaction" else: transaction_col_name = "transaction_name" assert query.get_selected_columns_from_ast() == [ SelectedExpression( "tags[transaction]", Column("_snuba_tags[transaction]", None, transaction_col_name), ), SelectedExpression( "contexts[browser.name]", FunctionCall( "_snuba_contexts[browser.name]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "browser.name"), ), ), ), ), ), ] return QueryResult({}, {}) events_entity.get_query_pipeline_builder().build_execution_pipeline( request, query_runner).execute()
def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery: dataset = get_dataset("transactions") query = parse_query(query_body, dataset) request = Request("a", query_body, query, HTTPRequestSettings(), "r") entity = get_entity(query.get_from_clause().key) for p in entity.get_query_processors(): p.process_query(query, request.settings) ArrayJoinKeyValueOptimizer("tags").process_query(query, request.settings) query_plan = SingleStorageQueryPlanBuilder( storage=entity.get_writable_storage(), mappers=transaction_translator, ).build_and_rank_plans(query, request.settings)[0] return query_plan.query
def test_col_split(
    dataset_name: str,
    first_query_data: Mapping[str, Any],
    second_query_data: Mapping[str, Any],
):
    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        selected_cols = request.query.get_selected_columns()
        if selected_cols == list(first_query_data[0].keys()):
            return RawQueryResult({"data": first_query_data}, {})
        elif selected_cols == list(second_query_data[0].keys()):
            return RawQueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": list(second_query_data[0].keys()),
            "conditions": [""],
            "orderby": "events.event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_all_storages()[0].get_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        uuid.uuid4().hex,
        query,
        HTTPRequestSettings(),
        {
            "project": {"project": 1},
            "timeseries": {
                "from_date": "2019-09-19T10:00:00",
                "to_date": "2019-09-19T12:00:00",
                "granularity": 3600,
            },
        },
        "tests",
    )

    do_query(events, request, None)

def test_nested_optimizer(query_body, expected_condition) -> None:
    transactions = get_dataset("transactions")
    query = parse_query(query_body, transactions)
    request_settings = HTTPRequestSettings()
    request = Request("", query, request_settings, {}, "")
    query_plan = transactions.get_query_plan_builder().build_plan(request)
    processor = NestedFieldConditionOptimizer(
        nested_col="tags",
        flattened_col="tags_map",
        timestamp_cols={"start_ts", "finish_ts"},
        beginning_of_time=datetime(2019, 12, 11, 0, 0, 0),
    )

    clickhouse_query = query_plan.query
    processor.process_query(clickhouse_query, request_settings)

    assert clickhouse_query.get_conditions() == expected_condition

def run_non_consistent() -> Result:
    request_copy = Request(
        id=request.id,
        body=copy.deepcopy(request.body),
        query=copy.deepcopy(request.query),
        settings=SubscriptionRequestSettings(consistent=False),
        referrer=request.referrer,
    )

    return parse_and_run_query(
        self.__dataset,
        request_copy,
        timer,
        robust=True,
        concurrent_queries_gauge=self.__concurrent_clickhouse_gauge
        if not is_consistent_query
        else None,
    ).result

def test_data_source(
    query_body: MutableMapping[str, Any],
    expected_table: str,
) -> None:
    request_settings = HTTPRequestSettings()
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)
    request = Request("a", query, request_settings, {}, "r")

    for processor in get_dataset("discover").get_query_processors():
        processor.process_query(request.query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)
    for physical_processor in plan.plan_processors:
        physical_processor.process_query(plan.query, request.settings)

    assert plan.query.get_data_source().format_from() == expected_table, json.dumps(
        query_body
    )

def test_events_processing() -> None: query_body = {"selected_columns": ["tags[transaction]", "contexts[browser.name]"]} events = get_dataset("events") query = parse_query(query_body, events) request = Request("", query, HTTPRequestSettings(), {}, "") query_plan = ( events.get_default_entity().get_query_plan_builder().build_plan(request) ) for clickhouse_processor in query_plan.plan_processors: clickhouse_processor.process_query(query_plan.query, request.settings) def query_runner( query: Query, settings: RequestSettings, reader: Reader[SqlQuery] ) -> QueryResult: assert query.get_selected_columns_from_ast() == [ SelectedExpression( "tags[transaction]", Column("tags[transaction]", None, "transaction") ), SelectedExpression( "contexts[browser.name]", FunctionCall( "contexts[browser.name]", "arrayElement", ( Column(None, None, "contexts.value"), FunctionCall( None, "indexOf", ( Column(None, None, "contexts.key"), Literal(None, "browser.name"), ), ), ), ), ), ] return QueryResult({}, {}) query_plan.execution_strategy.execute( query_plan.query, request.settings, query_runner )
def test_sessions_processing() -> None: query_body = { "selected_columns": ["duration_quantiles", "sessions", "users"] } sessions = get_dataset("sessions") query = parse_query(query_body, sessions) request = Request("", query, HTTPRequestSettings(), {}, "") query_plan = (sessions.get_default_entity().get_query_plan_builder(). build_plan(request)) for clickhouse_processor in query_plan.plan_processors: clickhouse_processor.process_query(query_plan.query, request.settings) def query_runner(query: Query, settings: RequestSettings, reader: Reader[SqlQuery]) -> QueryResult: assert query.get_selected_columns_from_ast() == [ SelectedExpression( "duration_quantiles", CurriedFunctionCall( "duration_quantiles", FunctionCall( None, "quantilesIfMerge", (Literal(None, 0.5), Literal(None, 0.9)), ), (Column(None, None, "duration_quantiles"), ), ), ), SelectedExpression( "sessions", FunctionCall("sessions", "countIfMerge", (Column(None, None, "sessions"), )), ), SelectedExpression( "users", FunctionCall("users", "uniqIfMerge", (Column(None, None, "users"), )), ), ] return QueryResult({}, {}) query_plan.execution_strategy.execute(query_plan.query, request.settings, query_runner)
def test_col_split(
    dataset_name: str,
    first_query_data: Mapping[str, Any],
    second_query_data: Mapping[str, Any],
):
    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        selected_cols = request.query.get_selected_columns()
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, 200)
        elif selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, 200)
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    query = Query(
        {
            "selected_columns": list(second_query_data[0].keys()),
            "conditions": [""],
            "orderby": "events.event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        }
    )

    request = Request(
        query,
        RequestSettings(False, False, False),
        {
            "project": {"project": 1},
            "timeseries": {
                "from_date": "2019-09-19T10:00:00",
                "to_date": "2019-09-19T12:00:00",
                "granularity": 3600,
            },
        },
    )

    events = get_dataset(dataset_name)
    do_query(events, request, None)

def test_no_split(dataset_name: str): events = get_dataset(dataset_name) query = Query( { "selected_columns": ["event_id"], "conditions": [""], "orderby": "event_id", "sample": 10, "limit": 100, "offset": 50, }, events.get_all_storages() [0].get_schemas().get_read_schema().get_data_source(), ) @split_query def do_query(dataset: Dataset, request: Request, timer: Timer): assert request.query == query request = Request(uuid.uuid4().hex, query, HTTPRequestSettings(), {}, "tests") do_query(events, request, None)
def test_metrics_processing(
    entity_name: str,
    column_name: str,
    entity_key: EntityKey,
    translated_value: Expression,
) -> None:
    settings.ENABLE_DEV_FEATURES = True
    settings.DISABLED_DATASETS = set()

    importlib.reload(factory)
    importlib.reload(storage_factory)
    importlib.reload(cluster)

    query_body = {
        "query": (
            f"MATCH ({entity_name}) "
            f"SELECT {column_name} BY org_id, project_id, tags[10] "
            "WHERE "
            "timestamp >= toDateTime('2021-05-17 19:42:01') AND "
            "timestamp < toDateTime('2021-05-17 23:42:01') AND "
            "org_id = 1 AND "
            "project_id = 1"
        ),
    }

    metrics_dataset = get_dataset("metrics")
    query = parse_snql_query(query_body["query"], [], metrics_dataset)

    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader
    ) -> QueryResult:
        assert query.get_selected_columns() == [
            SelectedExpression(
                "org_id",
                Column("_snuba_org_id", None, "org_id"),
            ),
            SelectedExpression(
                "project_id",
                Column("_snuba_project_id", None, "project_id"),
            ),
            SelectedExpression(
                "tags[10]",
                FunctionCall(
                    "_snuba_tags[10]",
                    "arrayElement",
                    (
                        Column(None, None, "tags.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (Column(None, None, "tags.key"), Literal(None, 10)),
                        ),
                    ),
                ),
            ),
            SelectedExpression(
                column_name,
                translated_value,
            ),
        ]
        return QueryResult({}, {})

    entity = get_entity(entity_key)
    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()

def test_simple():
    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    query = Query(
        request_body,
        get_storage("events").get_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        uuid.UUID("a" * 32).hex, query, HTTPRequestSettings(), {}, "search"
    )

    time = TestingClock()
    timer = Timer("test", clock=time)
    time.sleep(0.01)

    message = SnubaQueryMetadata(
        request=request,
        dataset=get_dataset("events"),
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql="select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100",
                stats={"sample": 10},
                status="success",
                trace_id="b" * 32,
            )
        ],
    ).to_dict()

    processor = (
        enforce_table_writer(get_dataset("querylog")).get_stream_loader().get_processor()
    )

    assert processor.process_message(message) == ProcessedMessage(
        ProcessorAction.INSERT,
        [
            {
                "request_id": str(uuid.UUID("a" * 32)),
                "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
                "referrer": "search",
                "dataset": get_dataset("events"),
                "projects": [1],
                "organization": None,
                "timestamp": timer.for_json()["timestamp"],
                "duration_ms": 10,
                "status": "success",
                "clickhouse_queries.sql": [
                    "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
                ],
                "clickhouse_queries.status": ["success"],
                "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
                "clickhouse_queries.duration_ms": [0],
                "clickhouse_queries.stats": ['{"sample": 10}'],
                "clickhouse_queries.final": [0],
                "clickhouse_queries.cache_hit": [0],
                "clickhouse_queries.sample": [10.0],
                "clickhouse_queries.max_threads": [0],
                "clickhouse_queries.num_days": [0],
                "clickhouse_queries.clickhouse_table": [""],
                "clickhouse_queries.query_id": [""],
                "clickhouse_queries.is_duplicate": [0],
                "clickhouse_queries.consistent": [0],
            }
        ],
    )

def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPQuerySettings], Type[SubscriptionQuerySettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
    custom_processing: Optional[CustomProcessors] = None,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPQuerySettings:
                query_settings: MutableMapping[str, bool | str] = {
                    **request_parts.query_settings,
                    "consistent": _consistent_override(
                        request_parts.query_settings.get("consistent", False), referrer
                    ),
                }
                query_settings["referrer"] = referrer
                # TODO: referrer probably doesn't need to be passed in, it should be from the body
                settings_obj: Union[
                    HTTPQuerySettings, SubscriptionQuerySettings
                ] = settings_class(
                    **query_settings,
                )
            elif settings_class == SubscriptionQuerySettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer),
                )

            query, snql_anonymized = parser(
                request_parts, settings_obj, dataset, custom_processing
            )

            project_ids = get_object_ids_in_query_ast(query, "project_id")
            if project_ids is not None and len(project_ids) == 1:
                sentry_sdk.set_tag("snuba_project_id", project_ids.pop())

            org_ids = get_object_ids_in_query_ast(query, "org_id")
            if org_ids is not None and len(org_ids) == 1:
                sentry_sdk.set_tag("snuba_org_id", org_ids.pop())

            attribution_info = dict(request_parts.attribution_info)
            # TODO: clean this up
            attribution_info["app_id"] = get_app_id(
                request_parts.attribution_info["app_id"]
            )
            attribution_info["referrer"] = referrer

            request_id = uuid.uuid4().hex
            request = Request(
                id=request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                original_body=body,
                query=query,
                attribution_info=AttributionInfo(**attribution_info),
                query_settings=settings_obj,
                snql_anonymized=snql_anonymized,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data(
            "snuba_query_parsed",
            repr(query).split("\n"),
        )
        span.set_data(
            "snuba_query_raw",
            textwrap.wrap(repr(request.original_body), 100, break_long_words=False),
        )
        sentry_sdk.add_breadcrumb(
            category="query_info",
            level="info",
            message="snuba_query_raw",
            data={
                "query": textwrap.wrap(
                    repr(request.original_body), 100, break_long_words=False
                )
            },
        )

        timer.mark("validate_schema")
        return request