def test_environment_param(self):
    self.params["environment"] = ["", "prod"]
    query = QueryBuilder(Dataset.Discover, self.params, selected_columns=["environment"])

    self.assertCountEqual(
        query.where,
        [
            *self.default_conditions,
            Or(
                [
                    Condition(Column("environment"), Op.IS_NULL),
                    Condition(Column("environment"), Op.EQ, "prod"),
                ]
            ),
        ],
    )
    query.get_snql_query().validate()

    self.params["environment"] = ["dev", "prod"]
    query = QueryBuilder(Dataset.Discover, self.params, selected_columns=["environment"])

    self.assertCountEqual(
        query.where,
        [
            *self.default_conditions,
            Condition(Column("environment"), Op.IN, ["dev", "prod"]),
        ],
    )
    query.get_snql_query().validate()
def test_environment_filter(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "environment:prod",
        ["environment"],
    )
    self.assertCountEqual(
        query.where,
        [
            Condition(Column("environment"), Op.EQ, "prod"),
            *self.default_conditions,
        ],
    )
    query.get_snql_query().validate()

    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "environment:[dev, prod]",
        ["environment"],
    )
    self.assertCountEqual(
        query.where,
        [
            Condition(Column("environment"), Op.IN, ["dev", "prod"]),
            *self.default_conditions,
        ],
    )
    query.get_snql_query().validate()
def test_simple_orderby(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        selected_columns=["user.email", "release"],
        orderby=["user.email"],
    )

    self.assertCountEqual(query.where, self.default_conditions)
    self.assertCountEqual(
        query.orderby,
        [OrderBy(Function("toString", [Column("email")], "user.email"), Direction.ASC)],
    )
    query.get_snql_query().validate()

    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        selected_columns=["user.email", "release"],
        orderby=["-user.email"],
    )

    self.assertCountEqual(query.where, self.default_conditions)
    self.assertCountEqual(
        query.orderby,
        [OrderBy(Function("toString", [Column("email")], "user.email"), Direction.DESC)],
    )
    query.get_snql_query().validate()
def test_spans_columns(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "array_join(spans_op)",
            "array_join(spans_group)",
            "sumArray(spans_exclusive_time)",
        ],
        functions_acl=["array_join", "sumArray"],
    )
    self.assertCountEqual(
        query.columns,
        [
            Function("arrayJoin", [Column("spans.op")], "array_join_spans_op"),
            Function("arrayJoin", [Column("spans.group")], "array_join_spans_group"),
            Function(
                "sum",
                [Function("arrayJoin", [Column("spans.exclusive_time")])],
                "sumArray_spans_exclusive_time",
            ),
        ],
    )
def test_project_alias_column_with_project_condition(self):
    project1 = self.create_project()
    project2 = self.create_project()
    self.params["project_id"] = [project1.id, project2.id]
    query = QueryBuilder(
        Dataset.Discover, self.params, f"project:{project1.slug}", selected_columns=["project"]
    )

    self.assertCountEqual(
        query.where,
        [
            Condition(Column("project_id"), Op.EQ, project1.id),
            Condition(Column("timestamp"), Op.GTE, self.start),
            Condition(Column("timestamp"), Op.LT, self.end),
        ],
    )
    # Because of the condition on project there should only be 1 project in the transform
    self.assertCountEqual(
        query.select,
        [
            Function(
                "transform",
                [
                    Column("project_id"),
                    [project1.id],
                    [project1.slug],
                    "",
                ],
                "project",
            )
        ],
    )
def test_project_alias_column(self):
    # TODO(snql-boolean): Update this to match the corresponding test in test_filter
    project1 = self.create_project()
    project2 = self.create_project()
    self.params["project_id"] = [project1.id, project2.id]
    query = QueryBuilder(Dataset.Discover, self.params, selected_columns=["project"])

    self.assertCountEqual(
        query.where,
        [
            Condition(Column("project_id"), Op.IN, [project1.id, project2.id]),
            Condition(Column("timestamp"), Op.GTE, self.start),
            Condition(Column("timestamp"), Op.LT, self.end),
        ],
    )
    self.assertCountEqual(
        query.select,
        [
            Function(
                "transform",
                [
                    Column("project_id"),
                    [project1.id, project2.id],
                    [project1.slug, project2.slug],
                    "",
                ],
                "project",
            )
        ],
    )
def wip_snql_query(
    selected_columns,
    query,
    params,
    equations=None,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    Replacement API for `query` using SnQL. This function is still a work in
    progress and is not ready for use in production.
    """
    builder = QueryBuilder(
        Dataset.Discover,
        params,
        query=query,
        selected_columns=selected_columns,
        orderby=orderby,
        use_aggregate_conditions=use_aggregate_conditions,
        limit=limit,
    )
    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, referrer)
    return results
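# A minimal usage sketch for wip_snql_query, kept as a comment since the API is
# not production-ready. The query string, columns, and referrer below are
# illustrative assumptions, not values from the original source:
#
#     results = wip_snql_query(
#         selected_columns=["transaction", "count()"],
#         query="event.type:transaction",
#         params=params,  # assumed to carry project_id, start, and end
#         orderby=["-count"],
#         limit=10,
#         referrer="api.example.wip-snql",  # hypothetical referrer
#     )
#     rows = results["data"]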
def test_count_if(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "count_if(event.type,equals,transaction)",
            'count_if(event.type,notEquals,"transaction")',
        ],
    )
    self.assertCountEqual(query.where, self.default_conditions)
    self.assertCountEqual(
        query.aggregates,
        [
            Function(
                "countIf",
                [
                    Function("equals", [Column("type"), "transaction"]),
                ],
                "count_if_event_type_equals_transaction",
            ),
            Function(
                "countIf",
                [
                    Function("notEquals", [Column("type"), "transaction"]),
                ],
                "count_if_event_type_notEquals__transaction",
            ),
        ],
    )
def test_count_if_with_tags(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "count_if(foo,equals,bar)",
            'count_if(foo,notEquals,"baz")',
        ],
    )
    self.assertCountEqual(query.where, self.default_conditions)
    self.assertCountEqual(
        query.aggregates,
        [
            Function(
                "countIf",
                [
                    Function("equals", [Column("tags[foo]"), "bar"]),
                ],
                "count_if_foo_equals_bar",
            ),
            Function(
                "countIf",
                [
                    Function("notEquals", [Column("tags[foo]"), "baz"]),
                ],
                "count_if_foo_notEquals__baz",
            ),
        ],
    )
def test_retention(self):
    with self.options({"system.event-retention-days": 10}):
        with self.assertRaises(QueryOutsideRetentionError):
            QueryBuilder(
                Dataset.Discover,
                self.params,
                "",
                selected_columns=[],
            )
def test_array_combinator_is_private(self):
    with self.assertRaisesRegexp(InvalidSearchQuery, "sum: no access to private function"):
        QueryBuilder(
            Dataset.Discover,
            self.params,
            "",
            selected_columns=["sumArray(measurements_value)"],
        )
def test_array_combinator_with_non_array_arg(self):
    with self.assertRaisesRegexp(InvalidSearchQuery, "stuff is not a valid array column"):
        QueryBuilder(
            Dataset.Discover,
            self.params,
            "",
            selected_columns=["sumArray(stuff)"],
            functions_acl=["sumArray"],
        )
def test_orderby_duplicate_columns(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        selected_columns=["user.email", "user.email"],
        orderby=["user.email"],
    )
    self.assertCountEqual(
        query.orderby,
        [OrderBy(Column("email"), Direction.ASC)],
    )
def test_simple_limitby(self):
    query = QueryBuilder(
        dataset=Dataset.Discover,
        params=self.params,
        query="",
        selected_columns=["message"],
        orderby="message",
        limitby=("message", 1),
        limit=4,
    )

    assert query.limitby == LimitBy(Column("message"), 1)
def test_sample_rate(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "count()",
        ],
        sample_rate=0.1,
    )
    assert query.sample_rate == 0.1
    snql_query = query.get_snql_query()
    snql_query.validate()
    assert snql_query.match.sample == 0.1
def test_turbo(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "count()",
        ],
        turbo=True,
    )
    assert query.turbo.value
    snql_query = query.get_snql_query()
    snql_query.validate()
    assert snql_query.turbo.value
def test_project_in_condition_filters_not_in_project_filter(self):
    # TODO(snql-boolean): Update this to match the corresponding test in test_filter
    project1 = self.create_project()
    project2 = self.create_project()
    # params is assumed to be validated at this point, so this query should be invalid
    self.params["project_id"] = [project2.id]
    with self.assertRaisesRegexp(
        InvalidSearchQuery,
        re.escape(
            f"Invalid query. Project(s) {project1.slug} do not exist or are not actively selected."
        ),
    ):
        QueryBuilder(
            Dataset.Discover,
            self.params,
            f"project:{project1.slug}",
            selected_columns=["environment"],
        )
def data_fn(offset: int, limit: int) -> Any:
    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=["spans_op", "count()"],
        array_join="spans_op",
        query=query,
        limit=limit,
        offset=offset,
        orderby="-count",
    )
    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-span-ops")
    return [SpanOp(op=row["spans_op"], count=row["count"]) for row in results["data"]]
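# data_fn is shaped for an offset/limit paginator. A hedged sketch of how it
# might be wired into the surrounding endpoint (GenericOffsetPaginator and
# self.paginate are assumed to be available there, as elsewhere in this
# codebase; the per-page values are illustrative):
#
#     return self.paginate(
#         request=request,
#         paginator=GenericOffsetPaginator(data_fn=data_fn),
#         default_per_page=4,
#         max_per_page=4,
#     )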
def test_array_join(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=["array_join(measurements_key)", "count()"],
        functions_acl=["array_join"],
    )
    array_join_column = Function(
        "arrayJoin",
        [Column("measurements.key")],
        "array_join_measurements_key",
    )
    self.assertCountEqual(query.columns, [array_join_column, Function("count", [], "count")])
    # make sure the array join columns are present in groupby
    self.assertCountEqual(query.groupby, [array_join_column])
def test_array_combinator(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=["sumArray(measurements_value)"],
        functions_acl=["sumArray"],
    )
    self.assertCountEqual(
        query.columns,
        [
            Function(
                "sum",
                [Function("arrayJoin", [Column("measurements.value")])],
                "sumArray_measurements_value",
            )
        ],
    )
def test_array_join_clause(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "",
        selected_columns=[
            "spans_op",
            "count()",
        ],
        array_join="spans_op",
    )
    self.assertCountEqual(
        query.columns,
        [
            AliasedExpression(Column("spans.op"), "spans_op"),
            Function("count", [], "count"),
        ],
    )
    assert query.array_join == Column("spans.op")
    query.get_snql_query().validate()
def test_project_in_condition_filters(self):
    # TODO(snql-boolean): Update this to match the corresponding test in test_filter
    project1 = self.create_project()
    project2 = self.create_project()
    self.params["project_id"] = [project1.id, project2.id]
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        f"project:{project1.slug}",
        selected_columns=["environment"],
    )

    self.assertCountEqual(
        query.where,
        [
            Condition(Column("project_id"), Op.EQ, project1.id),
            Condition(Column("timestamp"), Op.GTE, self.start),
            Condition(Column("timestamp"), Op.LT, self.end),
        ],
    )
def test_simple_query(self):
    query = QueryBuilder(
        Dataset.Discover,
        self.params,
        "user.email:[email protected] release:1.2.1",
        ["user.email", "release"],
    )

    self.assertCountEqual(
        query.where,
        [
            Condition(Column("email"), Op.EQ, "[email protected]"),
            Condition(Column("release"), Op.EQ, "1.2.1"),
            *self.default_conditions,
        ],
    )
    self.assertCountEqual(
        query.select,
        [
            Function("toString", [Column("email")], "user.email"),
            Column("release"),
        ],
    )
    query.get_snql_query().validate()
def query_trace_data(
    trace_id: str, params: Mapping[str, str], use_snql: bool = False
) -> Tuple[Sequence[SnubaTransaction], Sequence[SnubaError]]:
    sentry_sdk.set_tag("discover.use_snql", use_snql)
    transaction_query: Union[QueryBuilder, discover.PreparedQuery]
    error_query: Union[QueryBuilder, discover.PreparedQuery]
    if use_snql:
        transaction_query = QueryBuilder(
            Dataset.Transactions,
            params,
            query=f"trace:{trace_id}",
            selected_columns=[
                "id",
                "transaction.status",
                "transaction.op",
                "transaction.duration",
                "transaction",
                "timestamp",
                "project",
                "project.id",
                "trace.span",
                "trace.parent_span",
                'to_other(trace.parent_span, "", 0, 1) AS root',
            ],
            # We want to guarantee at least getting the root, and hopefully events near it with timestamp
            # id is just for consistent results
            orderby=["-root", "timestamp", "id"],
            limit=MAX_TRACE_SIZE,
        )
        error_query = QueryBuilder(
            Dataset.Events,
            params,
            query=f"trace:{trace_id}",
            selected_columns=[
                "id",
                "project",
                "project.id",
                "timestamp",
                "trace.span",
                "transaction",
                "issue",
                "title",
                "tags[level]",
            ],
            # Don't add timestamp to this orderby as snuba will have to split the time range up and make multiple queries
            orderby=["id"],
            auto_fields=False,
            limit=MAX_TRACE_SIZE,
        )
        results = bulk_snql_query(
            [transaction_query.get_snql_query(), error_query.get_snql_query()],
            referrer="api.trace-view.get-events.wip-snql",
        )
        transformed_results = [
            discover.transform_results(result, query.function_alias_map, {}, None)["data"]
            for result, query in zip(results, [transaction_query, error_query])
        ]
        return cast(Sequence[SnubaTransaction], transformed_results[0]), cast(
            Sequence[SnubaError], transformed_results[1]
        )

    transaction_query = discover.prepare_discover_query(
        selected_columns=[
            "id",
            "transaction.status",
            "transaction.op",
            "transaction.duration",
            "transaction",
            "timestamp",
            # project gets the slug, and project.id gets added automatically
            "project",
            "trace.span",
            "trace.parent_span",
            'to_other(trace.parent_span, "", 0, 1) AS root',
        ],
        # We want to guarantee at least getting the root, and hopefully events near it with timestamp
        # id is just for consistent results
        orderby=["-root", "timestamp", "id"],
        params=params,
        query=f"event.type:transaction trace:{trace_id}",
    )
    error_query = discover.prepare_discover_query(
        selected_columns=[
            "id",
            "project",
            "timestamp",
            "trace.span",
            "transaction",
            "issue",
            "title",
            "tags[level]",
        ],
        # Don't add timestamp to this orderby as snuba will have to split the time range up and make multiple queries
        orderby=["id"],
        params=params,
        query=f"!event.type:transaction trace:{trace_id}",
        auto_fields=False,
    )
    snuba_params = [
        SnubaQueryParams(
            dataset=Dataset.Discover,
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            limit=MAX_TRACE_SIZE,
        )
        for snuba_filter in [transaction_query.filter, error_query.filter]
    ]
    results = bulk_raw_query(
        snuba_params,
        referrer="api.trace-view.get-events",
    )
    transformed_results = [
        discover.transform_results(result, query.fields["functions"], query.columns, query.filter)[
            "data"
        ]
        for result, query in zip(results, [transaction_query, error_query])
    ]
    return cast(Sequence[SnubaTransaction], transformed_results[0]), cast(
        Sequence[SnubaError], transformed_results[1]
    )
def query_suspect_span_groups(
    params: ParamsType,
    fields: List[str],
    query: Optional[str],
    span_ops: Optional[List[str]],
    span_groups: Optional[List[str]],
    direction: str,
    orderby: str,
    limit: int,
    offset: int,
) -> List[SuspectSpan]:
    suspect_span_columns = SPAN_PERFORMANCE_COLUMNS[orderby]

    selected_columns: List[str] = [
        column
        for column in suspect_span_columns.suspect_op_group_columns + fields
        if not is_equation(column)
    ] + [
        "array_join(spans_op)",
        "array_join(spans_group)",
        # want a single event id to fetch from nodestore for the span description
        "any(id)",
    ]

    equations: List[str] = [
        strip_equation(column)
        for column in suspect_span_columns.suspect_op_group_columns + fields
        if is_equation(column)
    ]

    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=selected_columns,
        equations=equations,
        query=query,
        orderby=[direction + column for column in suspect_span_columns.suspect_op_group_sort],
        auto_aggregations=True,
        use_aggregate_conditions=True,
        limit=limit,
        offset=offset,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    extra_conditions = []

    if span_ops:
        extra_conditions.append(
            Condition(
                builder.resolve_function("array_join(spans_op)"),
                Op.IN,
                Function("tuple", span_ops),
            )
        )

    if span_groups:
        extra_conditions.append(
            Condition(
                builder.resolve_function("array_join(spans_group)"),
                Op.IN,
                Function("tuple", span_groups),
            )
        )

    if extra_conditions:
        builder.add_conditions(extra_conditions)

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-suspects")

    return [
        SuspectSpan(
            op=suspect["array_join_spans_op"],
            group=suspect["array_join_spans_group"],
            description=get_span_description(
                EventID(params["project_id"][0], suspect["any_id"]),
                span_op=suspect["array_join_spans_op"],
                span_group=suspect["array_join_spans_group"],
            ),
            frequency=suspect.get("count_unique_id"),
            count=suspect.get("count"),
            avg_occurrences=suspect.get("equation[0]"),
            sum_exclusive_time=suspect.get("sumArray_spans_exclusive_time"),
            p50_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_50"),
            p75_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_75"),
            p95_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_95"),
            p99_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_99"),
        )
        for suspect in results["data"]
    ]
def validate(self, data):
    organization = self.context["organization"]
    query_info = data["query_info"]

    # Validate the project field, if provided
    # A PermissionDenied error will be raised in `get_projects_by_id` if the request is invalid
    project_query = query_info.get("project")
    if project_query:
        get_projects_by_id = self.context["get_projects_by_id"]
        # Coerce the query into a set
        if isinstance(project_query, list):
            projects = get_projects_by_id(set(map(int, project_query)))
        else:
            projects = get_projects_by_id({int(project_query)})
        query_info["project"] = [project.id for project in projects]

    # Discover Pre-processing
    if data["query_type"] == ExportQueryType.DISCOVER_STR:
        # coerce the fields into a list as needed
        base_fields = query_info.get("field", [])
        if not isinstance(base_fields, list):
            base_fields = [base_fields]
        equations, fields = categorize_columns(base_fields)
        if len(base_fields) > MAX_FIELDS:
            detail = f"You can export up to {MAX_FIELDS} fields at a time. Please delete some and try again."
            raise serializers.ValidationError(detail)
        elif len(base_fields) == 0:
            raise serializers.ValidationError("at least one field is required to export")

        if "query" not in query_info:
            detail = "query is required to export; please pass an empty string if you don't want to set one"
            raise serializers.ValidationError(detail)

        query_info["field"] = fields
        query_info["equations"] = equations

        if not query_info.get("project"):
            projects = self.context["get_projects"]()
            query_info["project"] = [project.id for project in projects]

        # make sure to fix the export start/end times to ensure consistent results
        try:
            start, end = get_date_range_from_params(query_info)
        except InvalidParams as e:
            sentry_sdk.set_tag("query.error_reason", "Invalid date params")
            raise serializers.ValidationError(str(e))

        if "statsPeriod" in query_info:
            del query_info["statsPeriod"]
        if "statsPeriodStart" in query_info:
            del query_info["statsPeriodStart"]
        if "statsPeriodEnd" in query_info:
            del query_info["statsPeriodEnd"]
        query_info["start"] = start.isoformat()
        query_info["end"] = end.isoformat()
        query_info["use_snql"] = features.has("organizations:discover-use-snql", organization)

        # validate the query string by trying to parse it
        processor = DiscoverProcessor(
            discover_query=query_info,
            organization_id=organization.id,
        )
        try:
            builder = QueryBuilder(
                Dataset.Discover,
                processor.params,
                query=query_info["query"],
                selected_columns=fields.copy(),
                equations=equations,
                auto_fields=True,
                auto_aggregations=True,
            )
            builder.get_snql_query()
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(str(err))

    return data
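# A hedged sketch of how this validate hook is typically exercised through the
# DRF serializer flow; the serializer class name and context keys below are
# assumptions, not values confirmed by the original source:
#
#     serializer = DataExportQuerySerializer(  # hypothetical class name
#         data={"query_type": "Discover", "query_info": query_info},
#         context={
#             "organization": organization,
#             "get_projects_by_id": get_projects_by_id,
#             "get_projects": get_projects,
#         },
#     )
#     serializer.is_valid(raise_exception=True)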
def query_example_transactions(
    params: ParamsType,
    query: Optional[str],
    direction: str,
    orderby: str,
    spans: List[Span],
    per_suspect: int = 5,
    offset: Optional[int] = None,
) -> Dict[Span, List[EventID]]:
    # there aren't any suspects, early return to save an empty query
    if not spans or per_suspect == 0:
        return {}

    orderby_columns = SPAN_PERFORMANCE_COLUMNS[orderby].suspect_example_sort

    selected_columns: List[str] = [
        "id",
        "project.id",
        "project",
        "array_join(spans_op)",
        "array_join(spans_group)",
        *orderby_columns,
    ]

    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=selected_columns,
        query=query,
        orderby=[direction + column for column in orderby_columns],
        # we want only `per_suspect` examples for each suspect
        limit=len(spans) * per_suspect,
        offset=offset,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    # we are only interested in the specific op, group pairs from the suspects
    builder.add_conditions(
        [
            Condition(
                Function(
                    "tuple",
                    [
                        builder.resolve_function("array_join(spans_op)"),
                        builder.resolve_function("array_join(spans_group)"),
                    ],
                ),
                Op.IN,
                Function(
                    "tuple",
                    [Function("tuple", [suspect.op, suspect.group]) for suspect in spans],
                ),
            ),
        ]
    )

    if len(spans) > 1:
        # Hack: the limit by clause only allows columns, but here we want to
        # do a limitby on the two array joins. For the time being, directly
        # do the limitby on the internal snuba name for the span group column,
        # but this should not be relied upon in production, and if two spans
        # differ only by the span op, this will result in an incorrect query.
        builder.limitby = LimitBy(Column("_snuba_array_join_spans_group"), per_suspect)

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-examples")

    examples: Dict[Span, List[EventID]] = {
        Span(suspect.op, suspect.group): [] for suspect in spans
    }
    for example in results["data"]:
        key = Span(example["array_join_spans_op"], example["array_join_spans_group"])
        value = EventID(example["project.id"], example["project"], example["id"])
        examples[key].append(value)

    return examples
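# A hedged usage sketch for query_example_transactions; the orderby key and the
# Span op/group pair below are illustrative assumptions:
#
#     examples = query_example_transactions(
#         params=params,
#         query=None,
#         direction="-",
#         orderby="sumExclusiveTime",  # hypothetical SPAN_PERFORMANCE_COLUMNS key
#         spans=[Span("db", "f0659032f61fc4d4")],  # hypothetical op/group pair
#         per_suspect=3,
#     )
#     for span, event_ids in examples.items():
#         ...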
def query_suspect_span_groups(
    params: ParamsType,
    fields: List[str],
    query: Optional[str],
    span_ops: Optional[List[str]],
    span_groups: Optional[List[str]],
    direction: str,
    orderby: str,
    limit: int,
    offset: int,
) -> List[SuspectSpan]:
    suspect_span_columns = SPAN_PERFORMANCE_COLUMNS[orderby]

    selected_columns: List[str] = [
        column
        for column in suspect_span_columns.suspect_op_group_columns + fields
        if not is_equation(column)
    ] + [
        "array_join(spans_op)",
        "array_join(spans_group)",
        "count()",
        "count_unique(id)",
    ]

    equations: List[str] = [
        strip_equation(column)
        for column in suspect_span_columns.suspect_op_group_columns
        if is_equation(column)
    ]

    # TODO: This adds all the possible fields to the query by default. However,
    # due to the way shards aggregate the rows, this can be slow. As an
    # optimization, allow the fields to be user specified to only get the
    # necessary aggregations.
    #
    # As part of the transition, continue to add all possible fields when it's
    # not specified, but this should be removed in the future.
    if not fields:
        for column in SPAN_PERFORMANCE_COLUMNS.values():
            for col in column.suspect_op_group_sort:
                if not col.startswith("equation["):
                    selected_columns.append(col)

    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=selected_columns,
        equations=equations,
        query=query,
        orderby=[direction + column for column in suspect_span_columns.suspect_op_group_sort],
        auto_aggregations=True,
        use_aggregate_conditions=True,
        limit=limit,
        offset=offset,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    extra_conditions = []

    if span_ops:
        extra_conditions.append(
            Condition(
                builder.resolve_function("array_join(spans_op)"),
                Op.IN,
                Function("tuple", span_ops),
            )
        )

    if span_groups:
        extra_conditions.append(
            Condition(
                builder.resolve_function("array_join(spans_group)"),
                Op.IN,
                Function("tuple", span_groups),
            )
        )

    if extra_conditions:
        builder.add_conditions(extra_conditions)

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-suspects")

    return [
        SuspectSpan(
            op=suspect["array_join_spans_op"],
            group=suspect["array_join_spans_group"],
            frequency=suspect.get("count_unique_id"),
            count=suspect.get("count"),
            avg_occurrences=suspect.get("equation[0]"),
            sum_exclusive_time=suspect.get("sumArray_spans_exclusive_time"),
            p50_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_50"),
            p75_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_75"),
            p95_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_95"),
            p99_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_99"),
        )
        for suspect in results["data"]
    ]