def test_or_query(self):
    result = get_filter(
        "trend_percentage():>0% OR trend_percentage():<100%",
        {"aliases": self.improved_aliases},
    )
    assert result.having == [
        [
            ["or", [["less", ["trend_percentage", 1.0]], ["greater", ["trend_percentage", 0.0]]]],
            "=",
            1,
        ]
    ]

    result = get_filter(
        "trend_percentage():>0% OR trend_percentage():<100%",
        {"aliases": self.regression_aliases},
    )
    assert result.having == [
        [
            ["or", [["greater", ["trend_percentage", 1.0]], ["less", ["trend_percentage", 2.0]]]],
            "=",
            1,
        ]
    ]
def test_greater_than(self):
    result = get_filter("trend_difference():>=0", {"aliases": self.improved_aliases})
    assert result.having == [["trend_difference", "<=", 0.0]]

    result = get_filter("trend_difference():>=0", {"aliases": self.regression_aliases})
    assert result.having == [["trend_difference", ">=", 0.0]]
def test_negation(self):
    result = get_filter("!trend_difference():>=0", {"aliases": self.improved_aliases})
    assert result.having == [["trend_difference", ">", 0.0]]

    result = get_filter("!trend_difference():>=0", {"aliases": self.regression_aliases})
    assert result.having == [["trend_difference", "<", 0.0]]
def test_confidence(self):
    result = get_filter("confidence():>6", {"aliases": self.improved_aliases})
    assert result.having == [["t_test", ">", 6.0]]

    result = get_filter("confidence():>6", {"aliases": self.regression_aliases})
    assert result.having == [["t_test", "<", -6.0]]
def get_timeseries_snuba_filter(selected_columns, query, params, rollup, default_count=True):
    snuba_filter = get_filter(query, params)
    if not snuba_filter.start and not snuba_filter.end:
        raise InvalidSearchQuery("Cannot get timeseries result without a start and end.")

    snuba_filter.update_with(resolve_field_list(selected_columns, snuba_filter, auto_fields=False))

    # Resolve the public aliases into the discover dataset names.
    snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)
    if not snuba_filter.aggregations:
        raise InvalidSearchQuery("Cannot get timeseries result with no aggregation.")

    # Change the alias of the first aggregation to count. This ensures compatibility
    # with other parts of the timeseries endpoint expectations
    if len(snuba_filter.aggregations) == 1 and default_count:
        snuba_filter.aggregations[0][2] = "count"

    return snuba_filter, translated_columns
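# Illustration (not part of the original module): a minimal sketch of the
# "rename the single aggregation to count" step above. The [function, column, alias]
# triple below is hypothetical; the shape is inferred from aggregations[0][2] and the
# [["count", None, "count"]] aggregations used elsewhere in this section.
example_aggregations = [["uniq", "user", "count_unique_user"]]
default_count = True
if len(example_aggregations) == 1 and default_count:
    # The timeseries endpoint expects the single series to be aliased "count".
    example_aggregations[0][2] = "count"
assert example_aggregations == [["uniq", "user", "count"]]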
def _get_events_snuba(self, request, group, environments, query, tags, start, end):
    default_end = timezone.now()
    default_start = default_end - timedelta(days=90)
    params = {
        "group_ids": [group.id],
        "project_id": [group.project_id],
        "organization_id": group.project.organization_id,
        "start": start if start else default_start,
        "end": end if end else default_end,
    }
    direct_hit_resp = get_direct_hit_response(request, query, params, "api.group-events")
    if direct_hit_resp:
        return direct_hit_resp

    if environments:
        params["environment"] = [env.name for env in environments]

    full = request.GET.get("full", False)
    try:
        snuba_filter = get_filter(request.GET.get("query", None), params)
    except InvalidSearchQuery as e:
        raise ParseError(detail=str(e))

    snuba_filter.conditions.append(["event.type", "!=", "transaction"])

    data_fn = partial(eventstore.get_events, referrer="api.group-events", filter=snuba_filter)
    serializer = EventSerializer() if full else SimpleEventSerializer()

    return self.paginate(
        request=request,
        on_results=lambda results: serialize(results, request.user, serializer),
        paginator=GenericOffsetPaginator(data_fn=data_fn),
    )
def test_simple(self):
    result = get_filter(
        "trend_percentage():>0% trend_difference():>0",
        {"aliases": self.improved_aliases},
    )
    assert result.having == [
        ["trend_percentage", "<", 1.0],
        ["trend_difference", "<", 0.0],
    ]

    result = get_filter(
        "trend_percentage():>0% trend_difference():>0",
        {"aliases": self.regression_aliases},
    )
    assert result.having == [
        ["trend_percentage", ">", 1.0],
        ["trend_difference", ">", 0.0],
    ]
def test_and_query(self):
    result = get_filter(
        "trend_percentage():>0% AND trend_percentage():<100%",
        {"aliases": self.improved_aliases},
    )
    assert result.having == [
        ["trend_percentage", "<", 1.0],
        ["trend_percentage", ">", 0.0],
    ]

    result = get_filter(
        "trend_percentage():>0% AND trend_percentage():<100%",
        {"aliases": self.regression_aliases},
    )
    assert result.having == [
        ["trend_percentage", ">", 1.0],
        ["trend_percentage", "<", 2.0],
    ]
def build_snuba_filter(
    self,
    query: str,
    environment: Optional[Environment],
    params: Optional[Mapping[str, Any]] = None,
) -> Filter:
    resolve_func = resolve_column(Dataset(self.dataset.value))
    aggregations = [self.aggregate]
    # This aggregation is added to return the total number of sessions in crash
    # rate alerts that is used to identify if we are below a general minimum alert threshold
    count_col = re.search(r"(sessions|users)", self.aggregate)
    if not count_col:
        raise UnsupportedQuerySubscription(
            "Only crash free percentage queries are supported for subscriptions "
            "over the sessions dataset"
        )
    count_col_matched = count_col.group()

    aggregations += [f"identity({count_col_matched}) AS {CRASH_RATE_ALERT_SESSION_COUNT_ALIAS}"]
    functions_acl = ["identity"]

    snuba_filter = get_filter(query, params=params)
    snuba_filter.update_with(
        resolve_field_list(
            aggregations, snuba_filter, auto_fields=False, functions_acl=functions_acl
        )
    )
    snuba_filter = resolve_snuba_aliases(snuba_filter, resolve_func)[0]
    if environment:
        snuba_filter.conditions.append(["environment", "=", environment.name])
    return snuba_filter
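# Illustration (not part of the original module): how the (sessions|users) search
# above extracts the count column from a crash-rate aggregate. The aggregate strings
# are hypothetical examples, not values taken from production alert rules.
import re

for example_aggregate in (
    "percentage(sessions_crashed, sessions) AS _crash_rate_alert_aggregate",
    "percentage(users_crashed, users) AS _crash_rate_alert_aggregate",
):
    match = re.search(r"(sessions|users)", example_aggregate)
    assert match is not None
    # The matched column is what gets wrapped in identity(...) above.
    print(match.group())  # "sessions", then "users"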
def get_snuba_query_args_legacy(
    self, request: Request, organization: Organization
) -> Dict[
    str,
    Union[
        Optional[datetime],
        Sequence[Sequence[Union[str, str, Any]]],
        Optional[Dict[str, Sequence[int]]],
    ],
]:
    params = self.get_filter_params(request, organization)
    query = request.GET.get("query")
    try:
        _filter = get_filter(query, params)
    except InvalidSearchQuery as e:
        raise ParseError(detail=str(e))

    snuba_args = {
        "start": _filter.start,
        "end": _filter.end,
        "conditions": _filter.conditions,
        "filter_keys": _filter.filter_keys,
    }

    return snuba_args
def validate(self, data):
    query = {}
    query_keys = [
        "environment",
        "query",
        "fields",
        "conditions",
        "aggregations",
        "range",
        "start",
        "end",
        "orderby",
        "limit",
        "widths",
        "yAxis",
        "display",
        "topEvents",
    ]

    for key in query_keys:
        if data.get(key) is not None:
            query[key] = data[key]

    version = data.get("version", 1)
    self.validate_version_fields(version, query)
    if version == 2:
        if len(query["fields"]) < 1:
            raise serializers.ValidationError("You must include at least one field.")

    if data["projects"] == ALL_ACCESS_PROJECTS:
        data["projects"] = []
        query["all_projects"] = True

    if "query" in query:
        try:
            get_filter(query["query"], self.context["params"])
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(f"Cannot save invalid query: {err}")

    return {
        "name": data["name"],
        "project_ids": data["projects"],
        "query": query,
        "version": version,
    }
def get_snuba_filter(self, request, organization, params=None):
    if params is None:
        params = self.get_snuba_params(request, organization)

    query = request.GET.get("query")
    try:
        return get_filter(query, params)
    except InvalidSearchQuery as e:
        raise ParseError(detail=str(e))
def query_tag_data(
    params: Mapping[str, str],
    referrer: str,
    filter_query: Optional[str] = None,
    aggregate_column: Optional[str] = None,
) -> Optional[Dict]:
    """
    Fetch general data about all the transactions with this transaction name to feed into the facet query
    :return: Returns the row with aggregate and count if the query was successful
             Returns None if query was not successful which causes the endpoint to return early
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", filter_query)
        snuba_filter = get_filter(filter_query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = discover.resolve_discover_aliases(snuba_filter)
        translated_aggregate_column = discover.resolve_discover_column(aggregate_column)

    with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"):
        # Get the average and count to use to filter the next request to facets
        tag_data = discover.query(
            selected_columns=[
                "count()",
                f"avg({aggregate_column}) as aggregate",
                f"max({aggregate_column}) as max",
                f"min({aggregate_column}) as min",
            ],
            conditions=[
                [translated_aggregate_column, "IS NOT NULL", None],
            ],
            query=filter_query,
            params=params,
            orderby=["-count"],
            referrer=f"{referrer}.all_transactions",
            limit=1,
        )

        if len(tag_data["data"]) != 1:
            return None

        counts = [r["count"] for r in tag_data["data"]]
        aggregates = [r["aggregate"] for r in tag_data["data"]]

        # Return early to avoid doing more queries with 0 count transactions or aggregates for columns that don't exist
        if counts[0] == 0 or aggregates[0] is None:
            return None

    if not tag_data["data"][0]:
        return None

    return tag_data["data"][0]
def __init__(self, query, params, allow_minute_resolution=False):
    self.query = query.get("query", "")
    self.raw_fields = raw_fields = query.getlist("field", [])
    self.raw_groupby = raw_groupby = query.getlist("groupBy", [])

    if len(raw_fields) == 0:
        raise InvalidField('Request is missing a "field"')

    self.fields = {}
    for key in raw_fields:
        if key not in COLUMN_MAP:
            raise InvalidField(f'Invalid field: "{key}"')
        self.fields[key] = COLUMN_MAP[key]

    self.groupby = []
    for key in raw_groupby:
        if key not in GROUPBY_MAP:
            raise InvalidField(f'Invalid groupBy: "{key}"')
        self.groupby.append(GROUPBY_MAP[key])

    start, end, rollup = get_constrained_date_range(query, allow_minute_resolution)
    self.rollup = rollup
    self.start = start
    self.end = end
    self.params = params

    query_columns = set()
    for field in self.fields.values():
        query_columns.update(field.get_snuba_columns(raw_groupby))
    for groupby in self.groupby:
        query_columns.update(groupby.get_snuba_columns())
    self.query_columns = list(query_columns)

    query_groupby = set()
    for groupby in self.groupby:
        query_groupby.update(groupby.get_snuba_groupby())
    self.query_groupby = list(query_groupby)

    # the `params` are:
    # project_id, organization_id, environment;
    # also: start, end; but we got those ourselves.
    snuba_filter = get_filter(self.query, params)

    # this makes sure that literals in complex queries are properly quoted,
    # and unknown fields are raised as errors
    conditions = [resolve_condition(c, resolve_column) for c in snuba_filter.conditions]

    self.aggregations = snuba_filter.aggregations
    self.conditions = conditions
    self.filter_keys = snuba_filter.filter_keys
def build_snuba_filter(dataset, query, aggregate, environment, event_types, params=None):
    resolve_func = (
        resolve_column(Dataset.Events)
        if dataset == QueryDatasets.EVENTS
        else resolve_column(Dataset.Transactions)
    )
    query = apply_dataset_query_conditions(dataset, query, event_types)
    snuba_filter = get_filter(query, params=params)
    snuba_filter.update_with(resolve_field_list([aggregate], snuba_filter, auto_fields=False))
    snuba_filter = resolve_snuba_aliases(snuba_filter, resolve_func)[0]
    if snuba_filter.group_ids:
        snuba_filter.conditions.append(
            ["group_id", "IN", list(map(int, snuba_filter.group_ids))]
        )
    if environment:
        snuba_filter.conditions.append(["environment", "=", environment.name])
    return snuba_filter
def get_snuba_query_args_legacy(self, request, organization):
    params = self.get_filter_params(request, organization)
    query = request.GET.get("query")
    try:
        _filter = get_filter(query, params)
    except InvalidSearchQuery as e:
        raise ParseError(detail=str(e))

    snuba_args = {
        "start": _filter.start,
        "end": _filter.end,
        "conditions": _filter.conditions,
        "filter_keys": _filter.filter_keys,
    }

    return snuba_args
def get_direct_hit_response(request, query, snuba_params, referrer):
    """
    Checks whether a query is a direct hit for an event, and if so returns
    a response. Otherwise returns None
    """
    event_id = normalize_event_id(query)
    if event_id:
        snuba_filter = get_filter(query=f"id:{event_id}", params=snuba_params)
        snuba_filter.conditions.append(["event.type", "!=", "transaction"])

        results = eventstore.get_events(referrer=referrer, filter=snuba_filter)

        if len(results) == 1:
            response = Response(serialize(results, request.user))
            response["X-Sentry-Direct-Hit"] = "1"
            return response
def validate(self, data):
    if not data.get("id"):
        keys = set(data.keys())
        if self.required_for_create - keys:
            raise serializers.ValidationError(
                {
                    "fields": "fields are required during creation.",
                    "conditions": "conditions are required during creation.",
                }
            )

    # Validate the query that would be created when run.
    conditions = self._get_attr(data, "conditions", "")
    fields = self._get_attr(data, "fields", []).copy()
    orderby = self._get_attr(data, "orderby", "")
    equations, fields = categorize_columns(fields)
    if equations is not None:
        resolved_equations, _ = resolve_equation_list(equations, fields)
    else:
        resolved_equations = []

    try:
        # When using the eps/epm functions, they require an interval argument
        # or to provide the start/end so that the interval can be computed.
        # This uses a hard coded start/end to ensure the validation succeeds
        # since the values themselves don't matter.
        params = {
            "start": datetime.now() - timedelta(days=1),
            "end": datetime.now(),
            "project_id": [p.id for p in self.context.get("projects")],
            "organization_id": self.context.get("organization").id,
        }
        snuba_filter = get_filter(conditions, params=params)
    except InvalidSearchQuery as err:
        raise serializers.ValidationError({"conditions": f"Invalid conditions: {err}"})

    if orderby:
        snuba_filter.orderby = get_function_alias(orderby)

    try:
        resolve_field_list(fields, snuba_filter, resolved_equations=resolved_equations)
    except InvalidSearchQuery as err:
        raise serializers.ValidationError({"fields": f"Invalid fields: {err}"})

    return data
def build_snuba_filter(
    self,
    query: str,
    environment: Optional[Environment],
    params: Optional[Mapping[str, Any]] = None,
) -> Filter:
    snuba_filter = get_filter(query, params=params)
    conditions = copy(snuba_filter.conditions)

    session_status_tag_values = resolve_many_weak(["crashed", "init"])
    snuba_filter.update_with(
        {
            "aggregations": [[f"{self.aggregation_func}(value)", None, "value"]],
            "conditions": [
                ["metric_id", "=", resolve(self.metric_key.value)],
                [self.session_status, "IN", session_status_tag_values],
            ],
            "groupby": self.get_query_groupby(),
            "rollup": self.get_granularity(),
        }
    )

    if environment:
        snuba_filter.conditions.append(
            [resolve_tag_key("environment"), "=", resolve_weak(environment.name)]
        )

    if query and len(conditions) > 0:
        release_conditions = [
            condition for condition in conditions if condition[0] == "release"
        ]

        for release_condition in release_conditions:
            snuba_filter.conditions.append(
                [
                    resolve_tag_key(release_condition[0]),
                    release_condition[1],
                    resolve_weak(release_condition[2]),
                ]
            )

    return snuba_filter
def build_snuba_filter(dataset, query, aggregate, environment, event_types, params=None):
    resolve_func = {
        QueryDatasets.EVENTS: resolve_column(Dataset.Events),
        QueryDatasets.SESSIONS: resolve_column(Dataset.Sessions),
        QueryDatasets.TRANSACTIONS: resolve_column(Dataset.Transactions),
    }[dataset]

    functions_acl = None

    aggregations = [aggregate]
    if dataset == QueryDatasets.SESSIONS:
        # This aggregation is added to return the total number of sessions in crash
        # rate alerts that is used to identify if we are below a general minimum alert threshold
        count_col = re.search(r"(sessions|users)", aggregate)
        count_col_matched = count_col.group()
        aggregations += [
            f"identity({count_col_matched}) AS {CRASH_RATE_ALERT_SESSION_COUNT_ALIAS}"
        ]
        functions_acl = ["identity"]

    query = apply_dataset_query_conditions(dataset, query, event_types)
    snuba_filter = get_filter(query, params=params)
    snuba_filter.update_with(
        resolve_field_list(
            aggregations, snuba_filter, auto_fields=False, functions_acl=functions_acl
        )
    )
    snuba_filter = resolve_snuba_aliases(snuba_filter, resolve_func)[0]
    if snuba_filter.group_ids:
        snuba_filter.conditions.append(
            ["group_id", "IN", list(map(int, snuba_filter.group_ids))]
        )
    if environment:
        snuba_filter.conditions.append(["environment", "=", environment.name])
    return snuba_filter
def build_snuba_filter(
    self,
    query: str,
    environment: Optional[Environment],
    params: Optional[Mapping[str, Any]] = None,
) -> Filter:
    resolve_func = resolve_column(Dataset(self.dataset.value))

    query = apply_dataset_query_conditions(QueryDatasets(self.dataset), query, self.event_types)
    snuba_filter = get_filter(query, params=params)
    snuba_filter.update_with(
        resolve_field_list([self.aggregate], snuba_filter, auto_fields=False)
    )
    snuba_filter = resolve_snuba_aliases(snuba_filter, resolve_func)[0]
    if snuba_filter.group_ids:
        snuba_filter.conditions.append(
            ["group_id", "IN", list(map(int, snuba_filter.group_ids))]
        )
    if environment:
        snuba_filter.conditions.append(["environment", "=", environment.name])
    return snuba_filter
def query_facet_performance(
    params: Mapping[str, str],
    tag_data: Mapping[str, Any],
    referrer: str,
    aggregate_column: Optional[str] = None,
    filter_query: Optional[str] = None,
    orderby: Optional[str] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
    all_tag_keys: Optional[bool] = None,
    tag_key: Optional[bool] = None,
) -> Dict:
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", filter_query)
        snuba_filter = get_filter(filter_query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = discover.resolve_discover_aliases(snuba_filter)
        translated_aggregate_column = discover.resolve_discover_column(aggregate_column)

    # Aggregate (avg) and count of all transactions for this query
    transaction_aggregate = tag_data["aggregate"]

    # Dynamically sample so at least 50000 transactions are selected
    sample_start_count = 50000
    transaction_count = tag_data["count"]
    sampling_enabled = transaction_count > sample_start_count

    # log-e growth starting at 50,000
    target_sample = max(
        sample_start_count * (math.log(transaction_count) - (math.log(sample_start_count) - 1)),
        transaction_count,
    )

    dynamic_sample_rate = 0 if transaction_count <= 0 else (target_sample / transaction_count)
    sample_rate = min(max(dynamic_sample_rate, 0), 1) if sampling_enabled else None
    frequency_sample_rate = sample_rate if sample_rate else 1

    # Exclude tags that have high cardinality and are generally unrelated to performance
    excluded_tags = [
        "tags_key",
        "NOT IN",
        ["trace", "trace.ctx", "trace.span", "project", "browser", "celery_task_id", "url"],
    ]

    with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"):
        span.set_data("sample_rate", sample_rate)
        span.set_data("target_sample", target_sample)

        conditions = snuba_filter.conditions
        aggregate_comparison = transaction_aggregate * 1.005 if transaction_aggregate else 0
        having = [excluded_tags]
        if not all_tag_keys and not tag_key:
            having.append(["aggregate", ">", aggregate_comparison])

        resolved_orderby = [] if orderby is None else orderby

        conditions.append([translated_aggregate_column, "IS NOT NULL", None])

        if tag_key:
            conditions.append(["tags_key", "IN", [tag_key]])
        tag_key_limit = limit if tag_key else 1

        tag_selected_columns = [
            [
                "divide",
                [
                    ["sum", [["minus", [translated_aggregate_column, transaction_aggregate]]]],
                    frequency_sample_rate,
                ],
                "sumdelta",
            ],
            ["count", [], "count"],
            [
                "divide",
                [["divide", [["count", []], frequency_sample_rate]], transaction_count],
                "frequency",
            ],
            ["divide", ["aggregate", transaction_aggregate], "comparison"],
            ["avg", [translated_aggregate_column], "aggregate"],
        ]

        limitby = [tag_key_limit, "tags_key"] if not tag_key else None

        results = discover.raw_query(
            selected_columns=tag_selected_columns,
            conditions=conditions,
            start=snuba_filter.start,
            end=snuba_filter.end,
            filter_keys=snuba_filter.filter_keys,
            orderby=resolved_orderby + ["tags_key", "tags_value"],
            groupby=["tags_key", "tags_value"],
            having=having,
            dataset=Dataset.Discover,
            referrer=f"{referrer}.tag_values",
            sample=sample_rate,
            turbo=sample_rate is not None,
            limitby=limitby,
            limit=limit,
            offset=offset,
        )

        results = discover.transform_results(results, {}, translated_columns, snuba_filter)

        return results
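# Illustration (not part of the original module): a rough worked example of the
# "frequency" column above, i.e. divide(divide(count(), frequency_sample_rate),
# transaction_count). All numbers are hypothetical.
example_frequency_sample_rate = 0.2   # the query sampled ~20% of rows
example_transaction_count = 1_000_000
example_sampled_count = 1_000         # sampled count for one (tags_key, tags_value) pair

estimated_count = example_sampled_count / example_frequency_sample_rate   # ~5,000 rows estimated
example_frequency = estimated_count / example_transaction_count           # 0.005 -> tag on ~0.5% of events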
def get_performance_facets(
    query,
    params,
    orderby=None,
    aggregate_column="duration",
    aggregate_function="avg",
    limit=20,
    offset=None,
    referrer=None,
):
    """
    High-level API for getting 'facet map' results for performance data

    Performance facets are high frequency tags and the aggregate duration of
    their most frequent values

    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.

    Returns Sequence[FacetResult]
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

    with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"):
        # Get the most relevant tag keys
        key_names = raw_query(
            aggregations=[
                [aggregate_function, aggregate_column, "aggregate"],
                ["count", None, "count"],
            ],
            start=snuba_filter.start,
            end=snuba_filter.end,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            orderby=["-count"],
            dataset=Dataset.Discover,
            referrer="{}.{}".format(referrer, "all_transactions"),
        )
        counts = [r["count"] for r in key_names["data"]]
        aggregates = [r["aggregate"] for r in key_names["data"]]

        # Return early to avoid doing more queries with 0 count transactions or aggregates for columns that don't exist
        if len(counts) != 1 or counts[0] == 0 or aggregates[0] is None:
            return []

    results = []
    snuba_filter.conditions.append([aggregate_column, "IS NOT NULL", None])

    # Aggregate for transaction
    transaction_aggregate = key_names["data"][0]["aggregate"]

    # Dynamically sample so at least 10000 transactions are selected
    transaction_count = key_names["data"][0]["count"]
    sampling_enabled = transaction_count > 50000
    # Log growth starting at 50,000
    target_sample = 50000 * (math.log(transaction_count, 10) - 3)

    dynamic_sample_rate = 0 if transaction_count <= 0 else (target_sample / transaction_count)
    sample_rate = min(max(dynamic_sample_rate, 0), 1) if sampling_enabled else None
    frequency_sample_rate = sample_rate if sample_rate else 1

    excluded_tags = [
        "tags_key",
        "NOT IN",
        ["trace", "trace.ctx", "trace.span", "project", "browser", "celery_task_id"],
    ]

    with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"):
        conditions = snuba_filter.conditions
        aggregate_comparison = transaction_aggregate * 1.01 if transaction_aggregate else 0
        having = [excluded_tags]
        if orderby and orderby in ("sumdelta", "-sumdelta", "aggregate", "-aggregate"):
            having.append(["aggregate", ">", aggregate_comparison])

        if orderby is None:
            orderby = []
        else:
            orderby = [orderby]

        tag_values = raw_query(
            selected_columns=[
                [
                    "sum",
                    [
                        "minus",
                        [
                            aggregate_column,
                            str(transaction_aggregate),
                        ],
                    ],
                    "sumdelta",
                ],
            ],
            aggregations=[
                [aggregate_function, aggregate_column, "aggregate"],
                ["count", None, "cnt"],
            ],
            conditions=conditions,
            start=snuba_filter.start,
            end=snuba_filter.end,
            filter_keys=snuba_filter.filter_keys,
            orderby=orderby + ["tags_key"],
            groupby=["tags_key", "tags_value"],
            having=having,
            dataset=Dataset.Discover,
            referrer="{}.{}".format(referrer, "tag_values"),
            sample=sample_rate,
            turbo=sample_rate is not None,
            limitby=[1, "tags_key"],
            limit=limit,
            offset=offset,
        )
        results.extend(
            [
                PerformanceFacetResult(
                    key=r["tags_key"],
                    value=r["tags_value"],
                    performance=float(r["aggregate"]),
                    count=int(r["cnt"]),
                    frequency=float((r["cnt"] / frequency_sample_rate) / transaction_count),
                    comparison=float(r["aggregate"] / transaction_aggregate),
                    sumdelta=float(r["sumdelta"]),
                )
                for r in tag_values["data"]
            ]
        )

    return results
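# Illustration (not part of the original module): a worked example of the log10
# sampling curve used in get_performance_facets above. The transaction count is
# hypothetical; the formula is the one in the function.
import math

example_transaction_count = 500_000
example_sampling_enabled = example_transaction_count > 50000  # True

# 50000 * (log10(500_000) - 3) ~= 134,949 rows targeted
example_target_sample = 50000 * (math.log(example_transaction_count, 10) - 3)
example_sample_rate = min(max(example_target_sample / example_transaction_count, 0), 1)
print(round(example_target_sample), round(example_sample_rate, 3))  # ~134949, ~0.27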
def get_timeseries_snuba_filter(selected_columns, query, params):
    snuba_filter = get_filter(query, params)
    if not snuba_filter.start and not snuba_filter.end:
        raise InvalidSearchQuery("Cannot get timeseries result without a start and end.")

    columns = []
    equations = []

    for column in selected_columns:
        if is_equation(column):
            equations.append(strip_equation(column))
        else:
            columns.append(column)

    if len(equations) > 0:
        resolved_equations, updated_columns = resolve_equation_list(
            equations, columns, aggregates_only=True, auto_add=True
        )
    else:
        resolved_equations = []
        updated_columns = columns

    # For the new apdex, we need to add project threshold config as a selected
    # column which means the group by for the time series won't work.
    # As a temporary solution, we will calculate the mean of all the project
    # level thresholds in the request and use the legacy apdex, user_misery
    # or count_miserable calculation.
    # TODO(snql): Alias the project_threshold_config column so it doesn't
    # have to be in the SELECT statement and group by to be able to use new apdex,
    # user_misery and count_miserable.
    threshold = None
    for agg in CONFIGURABLE_AGGREGATES:
        if agg not in updated_columns:
            continue

        if threshold is None:
            project_ids = params.get("project_id")
            threshold_configs = list(
                ProjectTransactionThreshold.objects.filter(
                    organization_id=params["organization_id"],
                    project_id__in=project_ids,
                ).values_list("threshold", flat=True)
            )

            projects_without_threshold = len(project_ids) - len(threshold_configs)
            threshold_configs.extend([DEFAULT_PROJECT_THRESHOLD] * projects_without_threshold)
            threshold = int(mean(threshold_configs))

        updated_columns.remove(agg)
        updated_columns.append(CONFIGURABLE_AGGREGATES[agg].format(threshold=threshold))

    snuba_filter.update_with(
        resolve_field_list(
            updated_columns, snuba_filter, auto_fields=False, resolved_equations=resolved_equations
        )
    )

    # Resolve the public aliases into the discover dataset names.
    snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)
    if not snuba_filter.aggregations:
        raise InvalidSearchQuery("Cannot get timeseries result with no aggregation.")

    return snuba_filter, translated_columns
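# Illustration (not part of the original module): the threshold-averaging step above,
# with hypothetical projects and thresholds standing in for the ORM query and for
# DEFAULT_PROJECT_THRESHOLD.
from statistics import mean

example_project_ids = [1, 2, 3]
example_threshold_configs = [250, 500]   # stand-in for the values_list("threshold") query
EXAMPLE_DEFAULT_THRESHOLD = 300          # assumed default, for illustration only

missing = len(example_project_ids) - len(example_threshold_configs)
example_threshold_configs.extend([EXAMPLE_DEFAULT_THRESHOLD] * missing)
example_threshold = int(mean(example_threshold_configs))  # (250 + 500 + 300) / 3 = 350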
def validate(self, data):
    if not data.get("id"):
        keys = set(data.keys())
        if self.required_for_create - keys:
            raise serializers.ValidationError(
                {
                    "fields": "fields are required during creation.",
                    "conditions": "conditions are required during creation.",
                }
            )

    # Validate the query that would be created when run.
    conditions = self._get_attr(data, "conditions", "")
    fields = self._get_attr(data, "fields", []).copy()
    orderby = self._get_attr(data, "orderby", "")
    equations, fields = categorize_columns(fields)
    is_table = is_table_display_type(self.context.get("displayType"))

    if equations is not None:
        try:
            resolved_equations, _, _ = resolve_equation_list(
                equations,
                fields,
                auto_add=not is_table,
                aggregates_only=not is_table,
            )
        except (InvalidSearchQuery, ArithmeticError) as err:
            raise serializers.ValidationError({"fields": f"Invalid fields: {err}"})
    else:
        resolved_equations = []

    try:
        parse_search_query(conditions)
    except InvalidSearchQuery as err:
        # We don't know if the widget that this query belongs to is an
        # Issue widget or Discover widget. Pass the error back to the
        # Widget serializer to decide whether or not to raise this
        # error based on the Widget's type
        data["issue_query_error"] = {"conditions": [f"Invalid conditions: {err}"]}

    try:
        # When using the eps/epm functions, they require an interval argument
        # or to provide the start/end so that the interval can be computed.
        # This uses a hard coded start/end to ensure the validation succeeds
        # since the values themselves don't matter.
        params = {
            "start": datetime.now() - timedelta(days=1),
            "end": datetime.now(),
            "project_id": [p.id for p in self.context.get("projects")],
            "organization_id": self.context.get("organization").id,
        }
        snuba_filter = get_filter(conditions, params=params)
    except InvalidSearchQuery as err:
        data["discover_query_error"] = {"conditions": [f"Invalid conditions: {err}"]}
        return data

    if orderby:
        snuba_filter.orderby = get_function_alias(orderby)

    try:
        resolve_field_list(fields, snuba_filter, resolved_equations=resolved_equations)
    except InvalidSearchQuery as err:
        # We don't know if the widget that this query belongs to is an
        # Issue widget or Discover widget. Pass the error back to the
        # Widget serializer to decide whether or not to raise this
        # error based on the Widget's type
        data["discover_query_error"] = {"fields": f"Invalid fields: {err}"}

    return data
def validate(self, data):
    organization = self.context["organization"]
    query_info = data["query_info"]

    # Validate the project field, if provided
    # A PermissionDenied error will be raised in `get_projects_by_id` if the request is invalid
    project_query = query_info.get("project")
    if project_query:
        get_projects_by_id = self.context["get_projects_by_id"]
        # Coerce the query into a set
        if isinstance(project_query, list):
            projects = get_projects_by_id(set(map(int, project_query)))
        else:
            projects = get_projects_by_id({int(project_query)})
        query_info["project"] = [project.id for project in projects]

    # Discover Pre-processing
    if data["query_type"] == ExportQueryType.DISCOVER_STR:
        # coerce the fields into a list as needed
        fields = query_info.get("field", [])
        if not isinstance(fields, list):
            fields = [fields]

        if len(fields) > MAX_FIELDS:
            detail = f"You can export up to {MAX_FIELDS} fields at a time. Please delete some and try again."
            raise serializers.ValidationError(detail)
        elif len(fields) == 0:
            raise serializers.ValidationError("at least one field is required to export")

        if "query" not in query_info:
            detail = "query is required to export, please pass an empty string if you don't want to set one"
            raise serializers.ValidationError(detail)

        query_info["field"] = fields

        if not query_info.get("project"):
            projects = self.context["get_projects"]()
            query_info["project"] = [project.id for project in projects]

        # make sure to fix the export start/end times to ensure consistent results
        try:
            start, end = get_date_range_from_params(query_info)
        except InvalidParams as e:
            sentry_sdk.set_tag("query.error_reason", "Invalid date params")
            raise serializers.ValidationError(str(e))

        if "statsPeriod" in query_info:
            del query_info["statsPeriod"]
        if "statsPeriodStart" in query_info:
            del query_info["statsPeriodStart"]
        if "statsPeriodEnd" in query_info:
            del query_info["statsPeriodEnd"]
        query_info["start"] = start.isoformat()
        query_info["end"] = end.isoformat()

        # validate the query string by trying to parse it
        processor = DiscoverProcessor(
            discover_query=query_info,
            organization_id=organization.id,
        )
        try:
            snuba_filter = get_filter(query_info["query"], processor.params)
            resolve_field_list(
                fields.copy(),
                snuba_filter,
                auto_fields=True,
                auto_aggregations=True,
            )
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(str(err))

    return data
def get_facets(query, params, limit=10, referrer=None):
    """
    High-level API for getting 'facet map' results.

    Facets are high frequency tags and attribute results that
    can be used to further refine user queries. When many projects
    are requested sampling will be enabled to help keep response times low.

    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.

    Returns Sequence[FacetResult]
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

    # Exclude tracing tags as they are noisy and generally not helpful.
    # TODO(markus): Tracing tags are no longer written but may still reside in DB.
    excluded_tags = ["tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span", "project"]]

    # Sampling keys for multi-project results as we don't need accuracy
    # with that much data.
    sample = len(snuba_filter.filter_keys["project_id"]) > 2

    with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"):
        # Get the most frequent tag keys
        key_names = raw_query(
            aggregations=[["count", None, "count"]],
            start=snuba_filter.start,
            end=snuba_filter.end,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            orderby=["-count", "tags_key"],
            groupby="tags_key",
            having=[excluded_tags],
            dataset=Dataset.Discover,
            limit=limit,
            referrer=referrer,
            turbo=sample,
        )
        top_tags = [r["tags_key"] for r in key_names["data"]]
        if not top_tags:
            return []

    # TODO(mark) Make the sampling rate scale based on the result size and scaling factor in
    # sentry.options. To test the lowest acceptable sampling rate, we use 0.1 which
    # is equivalent to turbo. We don't use turbo though as we need to re-scale data, and
    # using turbo could cause results to be wrong if the value of turbo is changed in snuba.
    sampling_enabled = options.get("discover2.tags_facet_enable_sampling")
    sample_rate = 0.1 if (sampling_enabled and key_names["data"][0]["count"] > 10000) else None
    # Rescale the results if we're sampling
    multiplier = 1 / sample_rate if sample_rate is not None else 1

    fetch_projects = False
    if len(params.get("project_id", [])) > 1:
        if len(top_tags) == limit:
            top_tags.pop()
        fetch_projects = True

    results = []
    if fetch_projects:
        with sentry_sdk.start_span(op="discover.discover", description="facets.projects"):
            project_values = raw_query(
                aggregations=[["count", None, "count"]],
                start=snuba_filter.start,
                end=snuba_filter.end,
                conditions=snuba_filter.conditions,
                filter_keys=snuba_filter.filter_keys,
                groupby="project_id",
                orderby="-count",
                dataset=Dataset.Discover,
                referrer=referrer,
                sample=sample_rate,
                # Ensures Snuba will not apply FINAL
                turbo=sample_rate is not None,
            )
            results.extend(
                [
                    FacetResult("project", r["project_id"], int(r["count"]) * multiplier)
                    for r in project_values["data"]
                ]
            )

    # Get tag counts for our top tags. Fetching them individually
    # allows snuba to leverage promoted tags better and enables us to get
    # the value count we want.
    max_aggregate_tags = options.get("discover2.max_tags_to_combine")
    individual_tags = []
    aggregate_tags = []
    for i, tag in enumerate(top_tags):
        if tag == "environment":
            # Add here tags that you want to be individual
            individual_tags.append(tag)
        elif i >= len(top_tags) - max_aggregate_tags:
            aggregate_tags.append(tag)
        else:
            individual_tags.append(tag)

    with sentry_sdk.start_span(
        op="discover.discover", description="facets.individual_tags"
    ) as span:
        span.set_data("tag_count", len(individual_tags))
        for tag_name in individual_tags:
            tag = f"tags[{tag_name}]"
            tag_values = raw_query(
                aggregations=[["count", None, "count"]],
                conditions=snuba_filter.conditions,
                start=snuba_filter.start,
                end=snuba_filter.end,
                filter_keys=snuba_filter.filter_keys,
                orderby=["-count"],
                groupby=[tag],
                limit=TOP_VALUES_DEFAULT_LIMIT,
                dataset=Dataset.Discover,
                referrer=referrer,
                sample=sample_rate,
                # Ensures Snuba will not apply FINAL
                turbo=sample_rate is not None,
            )
            results.extend(
                [
                    FacetResult(tag_name, r[tag], int(r["count"]) * multiplier)
                    for r in tag_values["data"]
                ]
            )

    if aggregate_tags:
        with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"):
            conditions = snuba_filter.conditions
            conditions.append(["tags_key", "IN", aggregate_tags])
            tag_values = raw_query(
                aggregations=[["count", None, "count"]],
                conditions=conditions,
                start=snuba_filter.start,
                end=snuba_filter.end,
                filter_keys=snuba_filter.filter_keys,
                orderby=["tags_key", "-count"],
                groupby=["tags_key", "tags_value"],
                dataset=Dataset.Discover,
                referrer=referrer,
                sample=sample_rate,
                # Ensures Snuba will not apply FINAL
                turbo=sample_rate is not None,
                limitby=[TOP_VALUES_DEFAULT_LIMIT, "tags_key"],
            )
            results.extend(
                [
                    FacetResult(r["tags_key"], r["tags_value"], int(r["count"]) * multiplier)
                    for r in tag_values["data"]
                ]
            )

    return results
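# Illustration (not part of the original module): how the 0.1 sample rate and the
# multiplier in get_facets above round-trip a tag count. The sampled count is hypothetical.
example_sample_rate = 0.1
example_multiplier = 1 / example_sample_rate if example_sample_rate is not None else 1  # 10.0

example_sampled_count = 1_234  # count Snuba returns for one tag value under sampling
example_estimated_count = int(example_sampled_count) * example_multiplier  # ~12,340 unsampled events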
def query_top_tags(
    params: Mapping[str, str],
    tag_key: str,
    limit: int,
    referrer: str,
    orderby: Optional[List[str]],
    offset: Optional[int] = None,
    aggregate_column: Optional[str] = None,
    filter_query: Optional[str] = None,
) -> Optional[List[Any]]:
    """
    Fetch counts by tag value, finding the top tag values for a tag key by a limit.
    :return: Returns the row with the value, the aggregate and the count if the query was successful
             Returns None if query was not successful which causes the endpoint to return early
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", filter_query)
        snuba_filter = get_filter(filter_query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = discover.resolve_discover_aliases(snuba_filter)
        translated_aggregate_column = discover.resolve_discover_column(aggregate_column)

    with sentry_sdk.start_span(op="discover.discover", description="facets.top_tags"):
        if not orderby:
            orderby = ["-count"]

        for i, sort in enumerate(orderby):
            if "frequency" in sort:
                # Replacing frequency as it's the same underlying data dimension, this way
                # we don't have to modify the existing histogram query.
                orderby[i] = sort.replace("frequency", "count")

        if "tags_value" not in orderby:
            orderby = orderby + ["tags_value"]

        # Get the average and count to use to filter the next request to facets
        tag_data = discover.query(
            selected_columns=[
                "count()",
                f"avg({aggregate_column}) as aggregate",
                "array_join(tags.value) as tags_value",
            ],
            query=filter_query,
            params=params,
            orderby=orderby,
            conditions=[
                [translated_aggregate_column, "IS NOT NULL", None],
                ["tags_key", "IN", [tag_key]],
            ],
            functions_acl=["array_join"],
            referrer=f"{referrer}.top_tags",
            limit=limit,
            offset=offset,
        )

        if len(tag_data["data"]) <= 0:
            return None

        counts = [r["count"] for r in tag_data["data"]]

        # Return early to avoid doing more queries with 0 count transactions or aggregates for columns that don't exist
        if counts[0] == 0:
            return None

    if not tag_data["data"]:
        return None

    return tag_data["data"]
def prepare_discover_query(
    selected_columns,
    query,
    params,
    orderby=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    return PreparedQuery(snuba_filter, translated_columns, resolved_fields)
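# Illustration (not part of the original module): the having-clause alias check above,
# applied to a hypothetical function-style clause of the [fn, [args]] shape the comment
# describes. The aliases, aggregations, and SNUBA_AND/SNUBA_OR stand-ins are made up.
EXAMPLE_SNUBA_AND, EXAMPLE_SNUBA_OR = "and", "or"

# Hypothetical aggregate condition: divide(failure_count, count) > 0.5
example_having_clause = [["divide", ["failure_count", "count"]], ">", 0.5]
# Aggregations selected by resolve_field_list; the last element of each is its alias.
example_selected_aggregations = [["count()", None, "count"]]  # "failure_count" was never selected

args_to_check = [[example_having_clause[0]]]
missing_aliases = []
while args_to_check:
    args = args_to_check.pop()
    for arg in args:
        if arg[0] in (EXAMPLE_SNUBA_AND, EXAMPLE_SNUBA_OR):
            args_to_check.extend(arg[1])
        elif isinstance(arg[1], (list, tuple)):
            alias = arg[1][0]
            if not any(alias == agg[-1] for agg in example_selected_aggregations):
                missing_aliases.append(alias)

print(missing_aliases)  # ['failure_count'] -> prepare_discover_query would raise InvalidSearchQuery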