def build_request(
    self,
    dataset: Dataset,
    timestamp: datetime,
    offset: Optional[int],
    timer: Timer,
    metrics: Optional[MetricsBackend] = None,
    referrer: str = SUBSCRIPTION_REFERRER,
) -> Request:
    schema = RequestSchema.build(SubscriptionQuerySettings)
    request = build_request(
        {"query": self.query},
        parse_snql_query,
        SubscriptionQuerySettings,
        schema,
        dataset,
        timer,
        referrer,
        [
            self.entity_subscription.validate_query,
            partial(self.add_conditions, timestamp, offset),
        ],
    )
    return request
def sdk_distribution(*, timer: Timer):
    request = validate_request_content(
        parse_request_body(http_request),
        RequestSchema(
            schemas.SDK_STATS_BASE_SCHEMA,
            SETTINGS_SCHEMA,
            schemas.SDK_STATS_EXTENSIONS_SCHEMA,
        ),
        timer,
    )

    request.query.set_aggregations([
        ['uniq', 'project_id', 'projects'],
        ['count()', None, 'count'],
    ])
    request.query.add_groupby(['sdk_name', 'rtime'])
    request.extensions['project'] = {
        'project': [],
    }

    dataset = get_dataset('events')
    ensure_table_exists(dataset)
    query_result = parse_and_run_query(dataset, request, timer)

    return (
        json.dumps(
            query_result.result,
            for_json=True,
            default=lambda obj: obj.isoformat() if isinstance(obj, datetime) else obj,
        ),
        query_result.status,
        {'Content-Type': 'application/json'},
    )
def test_split_request() -> None:
    payload = {
        "turbo": False,
        "consistent": False,
        "debug": False,
        "dry_run": False,
        "legacy": False,
        "team": "sns",
        "feature": "attribution",
        "app_id": "foobar",
        "query": """MATCH (something) dontcare""",
    }

    schema = RequestSchema.build(HTTPQuerySettings)
    parts = schema.validate(payload)

    assert set(parts.query_settings.keys()) == {
        "turbo",
        "consistent",
        "debug",
        "dry_run",
        "legacy",
        "referrer",
    }
    assert set(parts.attribution_info.keys()) == {
        "team",
        "feature",
        "app_id",
        "parent_api",
        "referrer",
    }
    assert set(parts.query.keys()) == {"query"}
def build_request(
    self,
    dataset: Dataset,
    timestamp: datetime,
    offset: Optional[int],
    timer: Timer,
    metrics: Optional[MetricsBackend] = None,
) -> Request:
    schema = RequestSchema.build_with_extensions(
        {},
        SubscriptionRequestSettings,
        Language.SNQL,
    )
    request = build_request(
        {"query": self.query},
        partial(
            parse_snql_query,
            [
                self.validate_subscription,
                partial(self.add_conditions, timestamp, offset),
            ],
        ),
        SubscriptionRequestSettings,
        schema,
        dataset,
        timer,
        SUBSCRIPTION_REFERRER,
    )
    return request
def build_request(
    self, dataset: Dataset, timestamp: datetime, offset: Optional[int], timer: Timer
) -> Request:
    """
    Returns a Request that can be used to run a query via `parse_and_run_query`.

    :param dataset: The Dataset to build the request for
    :param timestamp: Date that the query should run up until
    :param offset: Maximum offset we should query for
    """
    schema = RequestSchema.build_with_extensions(
        dataset.get_extensions(),
        SubscriptionRequestSettings,
    )
    extra_conditions: Sequence[Condition] = []
    if offset is not None:
        extra_conditions = [[["ifnull", ["offset", 0]], "<=", offset]]
    return validate_request_content(
        {
            "project": self.project_id,
            "conditions": [*self.conditions, *extra_conditions],
            "aggregations": self.aggregations,
            "from_date": (timestamp - self.time_window).isoformat(),
            "to_date": timestamp.isoformat(),
        },
        schema,
        timer,
        dataset,
        SUBSCRIPTION_REFERRER,
    )
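# Note (editor's sketch, not part of the original source): the extra offset
# condition above uses the legacy nested-list condition syntax, where a
# condition is [lhs, op, literal] and lhs may itself be a [function, [args]]
# pair. The toy formatter below is hypothetical and only makes that shape
# concrete; it is not a Snuba API.
def _format_legacy_condition(cond) -> str:
    lhs, op, rhs = cond
    if isinstance(lhs, list):
        fn, args = lhs
        lhs = f"{fn}({', '.join(map(str, args))})"
    return f"{lhs} {op} {rhs!r}"

assert _format_legacy_condition([["ifnull", ["offset", 0]], "<=", 42]) == "ifnull(offset, 0) <= 42"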
def validate_request_content(body, schema: RequestSchema, timer) -> Request:
    try:
        request = schema.validate(body)
    except jsonschema.ValidationError as error:
        raise BadRequest(str(error)) from error

    timer.mark('validate_schema')

    return request
def validate_request_content(body, schema: RequestSchema, timer, dataset: Dataset) -> Request:
    source = dataset.get_dataset_schemas().get_read_schema().get_data_source()
    try:
        request = schema.validate(body, source)
    except jsonschema.ValidationError as error:
        raise BadRequest(str(error)) from error

    timer.mark('validate_schema')

    return request
def validate_request_content(
    body, schema: RequestSchema, timer: Timer, dataset: Dataset, referrer: str
) -> Request:
    with sentry_sdk.start_span(description="validate_request_content", op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
            span.set_data("snuba_query", request.body)
        except jsonschema.ValidationError as error:
            raise BadRequest(str(error)) from error

        timer.mark("validate_schema")

    return request
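# Editor's sketch: the three validate_request_content variants above share the
# same validate-then-mark shape. A minimal, self-contained illustration using
# plain jsonschema; the schema, _BadRequest, and _validate names here are
# hypothetical stand-ins, not the project's actual classes.
import jsonschema

_EXAMPLE_SCHEMA = {"type": "object", "required": ["query"]}

class _BadRequest(Exception):
    pass

def _validate(body: dict) -> dict:
    try:
        jsonschema.validate(body, _EXAMPLE_SCHEMA)
    except jsonschema.ValidationError as error:
        # Chain the original error so callers can still inspect the schema failure.
        raise _BadRequest(str(error)) from error
    return body

assert _validate({"query": "MATCH (events) SELECT count()"})["query"]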
def snql_dataset_query_view(*, dataset: Dataset, timer: Timer) -> Union[Response, str]:
    if http_request.method == "GET":
        schema = RequestSchema.build(HTTPQuerySettings)
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        _trace_transaction(dataset)
        return dataset_query(dataset, body, timer)
    else:
        assert False, "unexpected fallthrough"
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )

    request = build_request(body, schema, timer, dataset, http_request.referrer)

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPRequestSettings], Type[SubscriptionRequestSettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPRequestSettings:
                settings = {
                    **request_parts.settings,
                    "consistent": _consistent_override(
                        request_parts.settings.get("consistent", False), referrer
                    ),
                }
                settings_obj: Union[
                    HTTPRequestSettings, SubscriptionRequestSettings
                ] = settings_class(**settings)
            elif settings_class == SubscriptionRequestSettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer)
                )

            query = parser(request_parts, settings_obj, dataset)

            request_id = uuid.uuid4().hex
            request = Request(
                request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                ChainMap(request_parts.query, *request_parts.extensions.values()),
                query,
                settings_obj,
                referrer,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
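# Editor's note on the ChainMap above: the Request "body" is reconstructed by
# layering the validated query part over each extension part; lookups fall
# through the maps left to right. A minimal illustration with hypothetical
# parts (not real request contents):
from collections import ChainMap

_query_part = {"query": "MATCH (events) SELECT count()"}
_extension_parts = {"project": {"project": [1]}, "timeseries": {"granularity": 60}}
_body = ChainMap(_query_part, *_extension_parts.values())
assert _body["query"].startswith("MATCH")
assert _body["granularity"] == 60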
def dataset_query_view(*, dataset: Dataset, timer: Timer):
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        return dataset_query(dataset, body, timer)
    else:
        assert False, "unexpected fallthrough"
def dataset_query_view(*, dataset_name: str, timer: Timer):
    dataset = get_dataset(dataset_name)
    if http_request.method == 'GET':
        schema = RequestSchema.build_with_extensions(dataset.get_extensions())
        return render_template(
            'query.html',
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == 'POST':
        body = parse_request_body(http_request)
        return dataset_query(dataset, body, timer)
    else:
        assert False, 'unexpected fallthrough'
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    assert http_request.method == "POST"
    ensure_table_exists(dataset)
    return format_result(
        run_query(
            dataset,
            validate_request_content(
                body,
                RequestSchema.build_with_extensions(
                    dataset.get_extensions(), HTTPRequestSettings
                ),
                timer,
                dataset,
                http_request.referrer,
            ),
            timer,
        )
    )
def dataset_query_view(*, dataset: Dataset, timer: Timer) -> Union[Response, str]:
    if http_request.method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(),
            HTTPRequestSettings,
            Language.LEGACY,
        )
        return render_template(
            "query.html",
            query_template=json.dumps(schema.generate_template(), indent=4),
        )
    elif http_request.method == "POST":
        body = parse_request_body(http_request)
        _trace_transaction(dataset)
        return dataset_query(dataset, body, timer, Language.LEGACY)
    else:
        assert False, "unexpected fallthrough"
def build_request(
    body, schema: RequestSchema, timer: Timer, dataset: Dataset, referrer: str
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request = schema.validate(body, dataset, referrer)
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
def test_build_request(
    body: MutableMapping[str, Any], language: Language, condition: Expression
) -> None:
    dataset = get_dataset("events")
    entity = dataset.get_default_entity()
    schema = RequestSchema.build_with_extensions(
        entity.get_extensions(),
        HTTPRequestSettings,
        language,
    )

    request = build_request(
        body,
        parse_legacy_query
        if language == Language.LEGACY
        else partial(parse_snql_query, []),
        HTTPRequestSettings,
        schema,
        dataset,
        Timer("test"),
        "my_request",
    )

    expected_query = Query(
        from_clause=Entity(EntityKey.EVENTS, entity.get_data_model()),
        selected_columns=[
            SelectedExpression(
                name="time",
                expression=Column(alias="_snuba_time", table_name=None, column_name="time"),
            ),
            SelectedExpression("count", FunctionCall("_snuba_count", "count", tuple())),
        ],
        condition=condition,
        groupby=[Column("_snuba_time", None, "time")],
        limit=1000,
        granularity=60,
    )

    assert request.referrer == "my_request"
    assert dict(request.body) == body
    status, differences = request.query.equals(expected_query)
    assert status, f"Query mismatch: {differences}"
def dataset_query(dataset, body, timer):
    assert http_request.method == 'POST'
    ensure_table_exists(dataset)

    schema = RequestSchema.build_with_extensions(dataset.get_extensions())
    query_result = parse_and_run_query(
        dataset,
        validate_request_content(body, schema, timer),
        timer,
    )

    def json_default(obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        elif isinstance(obj, UUID):
            return str(obj)
        return obj

    return (
        json.dumps(query_result.result, for_json=True, default=json_default),
        query_result.status,
        {'Content-Type': 'application/json'},
    )
def dataset_query(
    dataset: Dataset, body: MutableMapping[str, Any], timer: Timer
) -> Response:
    assert http_request.method == "POST"
    referrer = http_request.referrer or "<unknown>"  # mypy

    # Try to detect if new requests are being sent to the api
    # after the shutdown command has been issued, and if so
    # how long after. I don't want to do a disk check for
    # every query, so randomly sample until the shutdown file
    # is detected, and then log everything
    if IS_SHUTTING_DOWN or random.random() < 0.05:
        if IS_SHUTTING_DOWN or check_down_file_exists():
            tags = {"dataset": get_dataset_name(dataset)}
            metrics.increment("post.shutdown.query", tags=tags)
            diff = time.time() - (shutdown_time() or 0.0)  # this should never be None
            metrics.timing("post.shutdown.query.delay", diff, tags=tags)

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build(HTTPQuerySettings)

    request = build_request(
        body, parse_snql_query, HTTPQuerySettings, schema, dataset, timer, referrer
    )

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": str(cause),
            }
            logger.warning(
                str(cause),
                exc_info=True,
            )
        elif isinstance(cause, ClickhouseError):
            status = get_http_status_for_clickhouse_error(cause)
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, QueryTooLongException):
            status = 400
            details = {"type": "query-too-long", "message": str(cause)}
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.query_settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
def test_span_id_promotion(entity: Entity, expected_table_name: str) -> None:
    """In order to save space in the contexts column and provide faster query
    performance, we promote span_id to a proper column and don't store it in
    the actual contexts object in the DB.

    The client however, still queries by `contexts[trace.span_id]` and expects
    that it is a hex string rather than a 64 bit uint (which is what we store it as)

    This test makes sure that our query pipeline will do the proper column
    promotion and conversion
    """
    dataset_name = "discover"

    # The client queries by contexts[trace.span_id] even though that's not how we store it
    query_str = f"""MATCH (discover)
    SELECT contexts[trace.span_id]
    WHERE timestamp >= toDateTime('2021-07-25T15:02:10')
    AND timestamp < toDateTime('2021-07-26T15:02:10')
    AND contexts[trace.span_id] = '{span_id_hex}'
    AND project_id IN tuple(5492900)
    """

    # ----- create the request object as if it came in through our API -----
    query_body = {
        "query": query_str,
        "debug": True,
        "dataset": dataset_name,
        "turbo": False,
        "consistent": False,
    }
    dataset = get_dataset(dataset_name)
    schema = RequestSchema.build(HTTPQuerySettings)
    request = build_request(
        query_body,
        parse_snql_query,
        HTTPQuerySettings,
        schema,
        dataset,
        Timer(name="bloop"),
        "some_referrer",
    )
    # --------------------------------------------------------------------

    def query_verifier(
        query: Union[Query, CompositeQuery[Table]],
        settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        assert isinstance(query, Query)

        # in local and CI there's a table name difference
        # errors_local vs errors_dist and discover_local vs discover_dist
        # so we check using `in` instead of `==`
        assert expected_table_name in query.get_from_clause().table_name

        assert query.get_selected_columns() == [
            SelectedExpression(
                name="contexts[trace.span_id]",
                # the select converts the span_id into a lowercase hex string
                expression=FunctionCall(
                    "_snuba_contexts[trace.span_id]",
                    "lower",
                    (FunctionCall(None, "hex", (Column(None, None, "span_id"),)),),
                ),
            )
        ]

        class SpanIdVerifier(NoopVisitor):
            def __init__(self) -> None:
                self.found_span_condition = False
                super().__init__()

            def visit_function_call(self, exp: FunctionCall) -> None:
                if exp.function_name == "equals" and exp.parameters[0] == Column(
                    None, None, "span_id"
                ):
                    self.found_span_condition = True
                    # and here we can see that the hex string the client queried us with
                    # has been converted to the correct uint64
                    assert exp.parameters[1] == Literal(None, span_id_as_uint64)
                return super().visit_function_call(exp)

        verifier = SpanIdVerifier()
        condition = query.get_condition()
        assert condition is not None
        condition.accept(verifier)

        assert verifier.found_span_condition

        return QueryResult(
            result={"meta": [], "data": [], "totals": {}},
            extra={"stats": {}, "sql": "", "experiments": {}},
        )

    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_verifier
    ).execute()
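# Editor's sketch of the conversion the test above verifies: the client-facing
# hex string and the stored UInt64 are related by a plain base-16 conversion,
# mirroring the lower(hex(span_id)) projection and the equals-condition rewrite.
# The value below is a hypothetical example, not the test's actual fixture.
_example_span_id_hex = "91b99ca4cc1ee0cf"
_example_span_id_as_uint64 = int(_example_span_id_hex, 16)
# Round-trip back to the 16-character lowercase hex form the client expects.
assert format(_example_span_id_as_uint64, "016x") == _example_span_id_hex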
def test_nullable_field_casting(entity: Entity, expected_table_name: str) -> None:
    dataset_name = "discover"
    query_str = """MATCH (discover)
    SELECT uniq(sdk_version)
    WHERE timestamp >= toDateTime('2021-07-25T15:02:10')
    AND timestamp < toDateTime('2021-07-26T15:02:10')
    AND project_id IN tuple(5492900)
    """

    # ----- create the request object as if it came in through our API -----
    query_body = {
        "query": query_str,
        "debug": True,
        "dataset": dataset_name,
        "turbo": False,
        "consistent": False,
    }
    dataset = get_dataset(dataset_name)
    schema = RequestSchema.build(HTTPQuerySettings)
    request = build_request(
        query_body,
        parse_snql_query,
        HTTPQuerySettings,
        schema,
        dataset,
        Timer(name="bloop"),
        "some_referrer",
    )
    # --------------------------------------------------------------------

    def query_verifier(
        query: Union[Query, CompositeQuery[Table]],
        settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        # The only reason this extends StringifyVisitor is because it has all the other
        # visit methods implemented.
        class NullCastingVerifier(StringifyVisitor):
            def __init__(self) -> None:
                self.sdk_version_cast_to_null = False
                super().__init__()

            def visit_function_call(self, exp: FunctionCall) -> str:
                if (
                    exp.function_name == "cast"
                    and exp.alias == "_snuba_sdk_version"
                    and exp.parameters
                    == (
                        Column(None, None, "sdk_version"),
                        Literal(None, "Nullable(String)"),
                    )
                ):
                    self.sdk_version_cast_to_null = True
                return super().visit_function_call(exp)

        for select_expr in query.get_selected_columns():
            verifier = NullCastingVerifier()
            select_expr.expression.accept(verifier)
            assert verifier.sdk_version_cast_to_null

        return QueryResult(
            result={"meta": [], "data": [], "totals": {}},
            extra={"stats": {}, "sql": "", "experiments": {}},
        )

    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_verifier
    ).execute()
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPQuerySettings], Type[SubscriptionQuerySettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
    custom_processing: Optional[CustomProcessors] = None,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPQuerySettings:
                query_settings: MutableMapping[str, bool | str] = {
                    **request_parts.query_settings,
                    "consistent": _consistent_override(
                        request_parts.query_settings.get("consistent", False), referrer
                    ),
                }
                # TODO: referrer probably doesn't need to be passed in, it should be from the body
                query_settings["referrer"] = referrer
                settings_obj: Union[
                    HTTPQuerySettings, SubscriptionQuerySettings
                ] = settings_class(**query_settings)
            elif settings_class == SubscriptionQuerySettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer),
                )

            query, snql_anonymized = parser(
                request_parts, settings_obj, dataset, custom_processing
            )

            project_ids = get_object_ids_in_query_ast(query, "project_id")
            if project_ids is not None and len(project_ids) == 1:
                sentry_sdk.set_tag("snuba_project_id", project_ids.pop())

            org_ids = get_object_ids_in_query_ast(query, "org_id")
            if org_ids is not None and len(org_ids) == 1:
                sentry_sdk.set_tag("snuba_org_id", org_ids.pop())

            attribution_info = dict(request_parts.attribution_info)
            # TODO: clean this up
            attribution_info["app_id"] = get_app_id(
                request_parts.attribution_info["app_id"]
            )
            attribution_info["referrer"] = referrer

            request_id = uuid.uuid4().hex
            request = Request(
                id=request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                original_body=body,
                query=query,
                attribution_info=AttributionInfo(**attribution_info),
                query_settings=settings_obj,
                snql_anonymized=snql_anonymized,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data(
            "snuba_query_parsed",
            repr(query).split("\n"),
        )
        span.set_data(
            "snuba_query_raw",
            textwrap.wrap(repr(request.original_body), 100, break_long_words=False),
        )
        sentry_sdk.add_breadcrumb(
            category="query_info",
            level="info",
            message="snuba_query_raw",
            data={
                "query": textwrap.wrap(
                    repr(request.original_body), 100, break_long_words=False
                )
            },
        )
        timer.mark("validate_schema")
        return request
def dataset_query(
    dataset: Dataset, body: MutableMapping[str, Any], timer: Timer, language: Language
) -> Response:
    assert http_request.method == "POST"
    referrer = http_request.referrer or "<unknown>"  # mypy

    if language == Language.SNQL:
        metrics.increment("snql.query.incoming", tags={"referrer": referrer})
        parser: Callable[
            [RequestParts, RequestSettings, Dataset],
            Union[Query, CompositeQuery[Entity]],
        ] = partial(parse_snql_query, [])
    else:
        parser = parse_legacy_query

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_default_entity().get_extensions(), HTTPRequestSettings, language
        )

    request = build_request(
        body, parser, HTTPRequestSettings, schema, dataset, timer, referrer
    )

    try:
        result = parse_and_run_query(dataset, request, timer)

        # Some metrics to track the adoption of SnQL
        query_type = "simple"
        if language == Language.SNQL:
            if isinstance(request.query, CompositeQuery):
                if isinstance(request.query.get_from_clause(), JoinClause):
                    query_type = "join"
                else:
                    query_type = "subquery"

            metrics.increment(
                "snql.query.success", tags={"referrer": referrer, "type": query_type}
            )
    except QueryException as exception:
        status = 500
        details: Mapping[str, Any]

        cause = exception.__cause__
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained

        if language == Language.SNQL:
            metrics.increment(
                "snql.query.failed",
                tags={"referrer": referrer, "status": f"{status}"},
            )

        return Response(
            json.dumps(
                {"error": details, "timing": timer.for_json(), **exception.extra}
            ),
            status,
            {"Content-Type": "application/json"},
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}

    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)

    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
def test_tags_hashmap_optimization() -> None:
    entity = get_entity(EntityKey.DISCOVER)
    dataset_name = "discover"
    query_str = """
    MATCH (discover)
    SELECT count() AS count
    WHERE timestamp >= toDateTime('2021-07-12T19:45:01')
    AND timestamp < toDateTime('2021-08-11T19:45:01')
    AND project_id IN tuple(300688)
    AND ifNull(tags[duration_group], '') != ''
    AND ifNull(tags[duration_group], '') = '<10s'
    LIMIT 50
    """

    # ----- create the request object as if it came in through our API -----
    query_body = {
        "query": query_str,
        "debug": True,
        "dataset": dataset_name,
        "turbo": False,
        "consistent": False,
    }
    dataset = get_dataset(dataset_name)
    schema = RequestSchema.build(HTTPQuerySettings)
    request = build_request(
        query_body,
        parse_snql_query,
        HTTPQuerySettings,
        schema,
        dataset,
        Timer(name="bloop"),
        "some_referrer",
    )
    # --------------------------------------------------------------------

    def query_verifier(query: Query, settings: QuerySettings, reader: Reader) -> None:
        class ConditionVisitor(NoopVisitor):
            def __init__(self) -> None:
                self.found_hashmap_condition = False

            def visit_function_call(self, exp: FunctionCall) -> None:
                assert exp.function_name != "arrayElement"
                if (
                    exp.function_name == "has"
                    and isinstance(exp.parameters[0], Column)
                    and exp.parameters[0].column_name == "_tags_hash_map"
                ):
                    self.found_hashmap_condition = True
                return super().visit_function_call(exp)

        visitor = ConditionVisitor()
        query.get_condition().accept(visitor)
        assert visitor.found_hashmap_condition

    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_verifier
    ).execute()
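# Editor's sketch of why the rewrite verified above helps: instead of scanning
# the tags.key / tags.value arrays per row (arrayElement), the schema keeps a
# precomputed array of hashes of "key=value" pairs, so the condition becomes a
# direct membership check. A rough Python analogue, with Python's hash()
# standing in for the real column's cityHash64 values (illustration only):
def _tags_hash_map(tags: dict) -> set:
    return {hash(f"{key}={value}") for key, value in tags.items()}

_row_hashes = _tags_hash_map({"duration_group": "<10s", "environment": "prod"})
assert hash("duration_group=<10s") in _row_hashes  # the has(_tags_hash_map, ...) check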