def resolve_equation_list(
    equations: List[str],
    selected_columns: List[str],
    aggregates_only: Optional[bool] = False,
    auto_add: Optional[bool] = False,
    plain_math: Optional[bool] = False,
    use_snql: Optional[bool] = False,
) -> Tuple[List[JsonQueryType], List[str], List["ParsedEquation"]]:
    """Given a list of equation strings, resolve them to their equivalent snuba json query formats

    :param equations: list of equations strings that haven't been parsed yet
    :param selected_columns: list of public aliases from the endpoint, can be a mix of fields and aggregates
    :param aggregates_only: Optional parameter whether we need to enforce equations don't include fields
        intended for use with event-stats where fields aren't compatible since they change grouping
    :param auto_add: Optional parameter that will take any fields in the equation that's missing in the
        selected_columns and return a new list with them added
    :param plain_math: Allow equations that don't include any fields or functions, disabled by default
    :param use_snql: Whether we're resolving for snql or not
    :returns: a 3-tuple of (equations in snuba json form, a copy of selected_columns plus any
        auto-added columns, the parsed equations paired with whether they contain functions)
    :raises InvalidSearchQuery: when an equation has no field/function and plain_math is off, when
        aggregates_only is set and the equation has no function, or when a referenced column is
        not selected and auto_add is off
    """
    resolved_equations: List[JsonQueryType] = []
    parsed_equations: List[ParsedEquation] = []
    # Work on a copy so the caller's list is never mutated
    resolved_columns: List[str] = selected_columns[:]

    for index, equation in enumerate(equations):
        parsed_equation, fields, functions = parse_arithmetic(equation, use_snql=use_snql)

        if not fields and not functions and not plain_math:
            raise InvalidSearchQuery("Equations need to include a field or function")
        if aggregates_only and not functions:
            raise InvalidSearchQuery("Only equations on aggregate functions are supported")

        # Membership is checked against resolved_columns (not selected_columns) so that a column
        # referenced by several equations is only auto-added once.
        for field in fields:
            if field in resolved_columns:
                continue
            if not auto_add:
                raise InvalidSearchQuery(
                    f"{field} used in an equation but is not a selected field"
                )
            resolved_columns.append(field)

        for function in functions:
            if function in resolved_columns:
                continue
            if not auto_add:
                raise InvalidSearchQuery(
                    f"{function} used in an equation but is not a selected function"
                )
            resolved_columns.append(function)

        # We just jam everything into resolved_equations because the json format can't take arithmetic in the aggregates
        # field, but can do the aliases in the selected_columns field
        resolved_equations.append(parsed_equation.to_snuba_json(f"equation[{index}]"))
        # TODO: currently returning "resolved_equations" for the json syntax
        # once we're converted to SnQL this should only return parsed_equations
        parsed_equations.append(ParsedEquation(parsed_equation, len(functions) > 0))

    return resolved_equations, resolved_columns, parsed_equations
def filter_by_stage(
    self,
    organization_id: int,
    operator: str,
    value,
    project_ids: Sequence[int] = None,
    environments: List[str] = None,
) -> models.QuerySet:
    """Filter this queryset down to releases that are in the requested release stage(s).

    ``value`` is normalised with ``to_list``; ``=`` / ``!=`` are treated as ``IN`` / ``NOT IN``.
    Raises ``InvalidSearchQuery`` unless exactly one environment is given, or when a stage is
    not one of the known ``ReleaseStages``.

    NOTE(review): ``environments`` is only validated here; it is not applied to the
    ReleaseProjectEnvironment query in this block - confirm that is intended.
    """
    from sentry.models import ReleaseProjectEnvironment, ReleaseStages
    from sentry.search.events.filter import to_list

    if not environments or len(environments) != 1:
        raise InvalidSearchQuery("Choose a single environment to filter by release stage.")

    # Each stage is defined by which of the adopted/unadopted timestamps are set
    stage_filters = {
        ReleaseStages.ADOPTED: Q(adopted__isnull=False, unadopted__isnull=True),
        ReleaseStages.REPLACED: Q(adopted__isnull=False, unadopted__isnull=False),
        ReleaseStages.LOW_ADOPTION: Q(adopted__isnull=True, unadopted__isnull=True),
    }
    stages = to_list(value)
    operator = {"=": "IN", "!=": "NOT IN"}.get(operator, operator)

    if any(stage not in stage_filters for stage in stages):
        raise InvalidSearchQuery("Unsupported release.stage value.")

    rpes = ReleaseProjectEnvironment.objects.filter(
        release__organization_id=organization_id,
    ).select_related("release")
    if project_ids:
        rpes = rpes.filter(project_id__in=project_ids)

    # Any other operator leaves the Q empty, i.e. no stage restriction is applied
    stage_query = Q()
    for stage in stages:
        if operator == "IN":
            stage_query |= stage_filters[stage]
        elif operator == "NOT IN":
            stage_query &= ~stage_filters[stage]

    matching_release_ids = rpes.filter(stage_query).values_list("release_id", flat=True)
    return self.filter(id__in=Subquery(matching_release_ids))
def translate_transaction_status(val):
    """Translate a transaction status name into its code using SPAN_STATUS_NAME_TO_CODE.

    Raises InvalidSearchQuery, listing the accepted names, when ``val`` is not a known name.
    """
    if val in SPAN_STATUS_NAME_TO_CODE:
        return SPAN_STATUS_NAME_TO_CODE[val]

    accepted_names = ", ".join(SPAN_STATUS_NAME_TO_CODE.keys())
    raise InvalidSearchQuery(
        f"Invalid value {val} for transaction.status condition. Accepted "
        f"values are {accepted_names}"
    )
def convert_status_value(value, projects, user, environments):
    """Parse every status in ``value`` with ``parse_status_value`` and return the results in order.

    ``projects``, ``user`` and ``environments`` are accepted but not used by this converter.
    Raises InvalidSearchQuery for the first status that fails to parse.
    """
    parsed_statuses = []
    for raw_status in value:
        try:
            status_code = parse_status_value(raw_status)
        except ValueError:
            raise InvalidSearchQuery(f"invalid status value of '{raw_status}'")
        parsed_statuses.append(status_code)
    return parsed_statuses
def convert_condition_to_function(cond):
    """Rewrite a ``[lhs, operator, rhs]`` condition as ``[function_name, [lhs, rhs]]``.

    The function name is looked up in OPERATOR_TO_FUNCTION; an operator without a (truthy)
    entry raises InvalidSearchQuery.
    """
    operator = cond[1]
    function = OPERATOR_TO_FUNCTION.get(operator)
    if function:
        return [function, [cond[0], cond[2]]]

    # It's hard to make this error more specific without exposing internals to the end user
    raise InvalidSearchQuery(f"Operator {operator} is not a valid condition operator.")
def resolve_metric(self, value: str) -> int:
    """Resolve a metric name to its indexer id and record it on the builder.

    ``value`` is first mapped through ``constants.METRICS_MAP`` (falling back to itself).
    The resolved id is appended to ``self.builder.metric_ids`` and returned; an unresolvable
    name raises InvalidSearchQuery.
    """
    metric_name = constants.METRICS_MAP.get(value, value)
    metric_id = indexer.resolve(metric_name)
    if metric_id is None:
        # TODO: unsure if this should be incompatible or invalid
        raise InvalidSearchQuery(f"Metric: {value} could not be resolved")

    self.builder.metric_ids.append(metric_id)
    return metric_id
def visit_is_filter(self, node, children):
    """Build a SearchFilter for an ``is:<value>`` term.

    The raw value is translated through ``self.is_filter_translators`` into a
    (search key, search value) pair; negation turns the operator into ``!=``.
    Raises InvalidSearchQuery for list ("in") syntax or an unknown value.
    """
    # the key is "is" here, which we don't need
    negation, _, _, search_value = children
    raw_value = search_value.raw_value

    if raw_value.startswith("["):
        raise InvalidSearchQuery('"in" syntax invalid for "is" search')

    translators = self.is_filter_translators
    if raw_value not in translators:
        valid_values = sorted(translators.keys())
        raise InvalidSearchQuery(
            'Invalid value for "is" search, valid values are {}'.format(valid_values)
        )

    search_key, search_value = translators[raw_value]
    if self.is_negated(negation):
        operator = "!="
    else:
        operator = "="
    return SearchFilter(search_key, operator, search_value)
def parse_search_query(query):
    """Parse ``query`` with the event search grammar and visit it with IssueSearchVisitor.

    Boolean operators are disabled on the visitor. An incomplete parse is reported as
    InvalidSearchQuery carrying the failing expression name and column.
    """
    try:
        tree = event_search_grammar.parse(query)
    except IncompleteParseError as e:
        location = "Parse error: %r (column %d)." % (e.expr.name, e.column())
        hint = "This is commonly caused by unmatched-parentheses. Enclose any text in double quotes."
        raise InvalidSearchQuery("%s %s" % (location, hint))

    visitor = IssueSearchVisitor(allow_boolean=False)
    return visitor.visit(tree)
def resolve_where(self, query: str) -> None:
    """Parse ``query`` and append a condition to ``self.where`` for each search filter.

    Terms that are not SearchFilter instances are skipped, as are filters for which
    ``format_search_filter`` returns nothing. A ParseError becomes InvalidSearchQuery.
    """
    try:
        parsed_terms = parse_search_query(query, allow_boolean=True, params=self.params)
    except ParseError as parse_error:
        raise InvalidSearchQuery(
            f"Parse error: {parse_error.expr.name} (column {parse_error.column():d})"
        )

    for term in parsed_terms:
        if not isinstance(term, SearchFilter):
            continue
        conditions = self.format_search_filter(term)
        if conditions:
            self.where.append(conditions)
def test_discover_invalid_search_query(self, emailer, mock_query):
    """An InvalidSearchQuery raised by the query results in the generic invalid-query email.

    The same message is expected for both an ASCII and a non-ASCII exception text.
    """
    de = ExportedData.objects.create(
        user=self.user,
        organization=self.org,
        query_type=ExportQueryType.DISCOVER,
        query_info={"project": [self.project.id], "field": ["title"], "query": ""},
    )
    expected_message = "Invalid query. Please fix the query and try again."

    # The second case exercises a unicode exception message
    for exception_text in ("test", "\xfc"):
        mock_query.side_effect = InvalidSearchQuery(exception_text)
        with self.tasks():
            assemble_download(de.id)
        error = emailer.call_args[1]["message"]
        assert error == expected_message
def apply(self, queryset, search_filter):
    """Apply ``search_filter`` to ``queryset`` using the condition built by ``self.callback``.

    Operators in EQUALITY_OPERATORS filter the queryset, the other accepted operators
    exclude from it. Operators outside ``=``, ``!=``, ``IN``, ``NOT IN`` raise InvalidSearchQuery.
    """
    raw_value = search_filter.value.raw_value
    # The callback runs before the operator is validated
    q = self.callback(raw_value)

    operator = search_filter.operator
    if operator not in ("=", "!=", "IN", "NOT IN"):
        raise InvalidSearchQuery(
            f"Operator {search_filter.operator} not valid for search {search_filter}"
        )

    if operator in EQUALITY_OPERATORS:
        return queryset.filter(q)
    return queryset.exclude(q)
def resolve_equation_list(
    equations: List[str], snuba_filter: eventstore.Filter
) -> Dict[str, JsonQueryType]:
    """Parse each equation and append its snuba json form to the filter's selected columns.

    :param equations: equation strings to parse; each is limited to one operator
    :param snuba_filter: filter whose ``selected_columns`` list is extended in place
    :returns: ``{"selected_columns": <the extended list>}``
    :raises InvalidSearchQuery: when an equation references a field that is not selected
    """
    selected_columns = snuba_filter.selected_columns
    for index, equation in enumerate(equations):
        # only supporting 1 operation for now
        parsed_equation, fields = parse_arithmetic(equation, max_operators=1)
        for field in fields:
            # Compare against the selected columns; comparing against `fields` itself
            # could never fail and silently disabled this validation.
            if field not in selected_columns:
                raise InvalidSearchQuery(f"{field} used in an equation but is not a selected field")
        selected_columns.append(parsed_equation.to_snuba_json(f"equation[{index}]"))
    return {"selected_columns": selected_columns}
def parse_semver(version, operator) -> Optional[SemverFilter]:
    """
    Attempts to parse a release version using our semver syntax. version should be in
    format `<package_name>@<version>` or `<version>`, where package_name is a string and
    version is a version string matching semver format (https://semver.org/). We've
    slightly extended this format to allow up to 4 integers. EG
     - sentry@1.2.3.4
     - sentry@1.2.3.4-alpha
     - 1.2.3.4
     - 1.2.3.4-alpha
     - 1.*

    Versions that do not fully parse are treated as wildcard matches built from their
    leading integer parts; a non-integer, non-wildcard part raises InvalidSearchQuery.
    """
    (operator, negated) = handle_operator_negation(operator)
    operator = OPERATOR_TO_DJANGO[operator]
    if "@" not in version:
        # Give bare versions a placeholder package so they can be parsed uniformly
        version = f"{SEMVER_FAKE_PACKAGE}@{version}"
    parsed = parse_release_relay(version)
    parsed_version = parsed.get("version_parsed")

    if not parsed_version:
        # Try to parse as a wildcard match
        package, version = version.split("@", 1)
        version_parts = []
        if version:
            for part in version.split(".", 3):
                if part in SEMVER_WILDCARDS:
                    break
                try:
                    # We assume all ints for a wildcard match - not handling prerelease as
                    # part of these
                    version_parts.append(int(part))
                except ValueError:
                    raise InvalidSearchQuery(INVALID_SEMVER_MESSAGE)

        if not package or package == SEMVER_FAKE_PACKAGE:
            package = None
        return SemverFilter("exact", version_parts, package, negated)

    # Convert `pre` to always be a string
    prerelease = parsed_version["pre"] or ""
    version_parts = [
        parsed_version["major"],
        parsed_version["minor"],
        parsed_version["patch"],
        parsed_version["revision"],
        0 if prerelease else 1,
        prerelease,
    ]
    semver_filter = SemverFilter(operator, version_parts, negated=negated)

    package_name = parsed["package"]
    if package_name and package_name != SEMVER_FAKE_PACKAGE:
        semver_filter.package = package_name
    return semver_filter
def project_slug_converter(
    builder: QueryBuilder, search_filter: SearchFilter
) -> Optional[WhereType]:
    """Convert project slugs to ids and create a filter based on those.

    This is cause we only store project ids in clickhouse. Returns None when none of the
    slugs map to a project on the builder or when the converted condition is empty.
    Raises InvalidSearchQuery for an empty equality value, or for unknown slugs used with
    an equality operator.
    """
    value = search_filter.value.value

    if Op(search_filter.operator) == Op.EQ and value == "":
        raise InvalidSearchQuery(
            'Cannot query for has:project or project:"" as every event will have a project'
        )

    slugs = to_list(value)
    project_slugs: Mapping[str, int] = {
        slug: project_id
        for slug, project_id in builder.project_slugs.items()
        if slug in slugs
    }
    missing: List[str] = [slug for slug in slugs if slug not in project_slugs]
    if missing and search_filter.operator in constants.EQUALITY_OPERATORS:
        raise InvalidSearchQuery(
            f"Invalid query. Project(s) {', '.join(missing)} do not exist or are not actively selected."
        )

    # Sorted for consistent query results
    project_ids = sorted(project_slugs.values())
    if not project_ids:
        return None

    # Create a new search filter with the correct values
    id_value = project_ids if search_filter.is_in_filter else project_ids[0]
    id_filter = SearchFilter(
        SearchKey("project.id"),
        search_filter.operator,
        SearchValue(id_value),
    )
    converted_filter = builder.convert_search_filter_to_condition(id_filter)
    if not converted_filter:
        return None

    if search_filter.operator in constants.EQUALITY_OPERATORS:
        builder.projects_to_filter.update(project_ids)
    return converted_filter
def convert_search_filter(search_filter):
    """Run a filter's value through its registered value converter, if it has one.

    Converted filters get the new value and an ``IN`` / ``NOT IN`` operator. Filters
    without a converter are returned unchanged, except aggregate filters, which raise
    InvalidSearchQuery.
    """
    key_name = search_filter.key.name

    if key_name in value_converters:
        converter = value_converters[key_name]
        raw_values = to_list(search_filter.value.raw_value)
        new_value = converter(raw_values, projects, user, environments)
        return search_filter._replace(
            value=SearchValue(new_value),
            operator="IN" if search_filter.operator in EQUALITY_OPERATORS else "NOT IN",
        )

    if isinstance(search_filter, AggregateFilter):
        raise InvalidSearchQuery(
            f"Aggregate filters ({search_filter.key.name}) are not supported in issue searches."
        )

    return search_filter
def _error_handled_filter_converter(
    search_filter: SearchFilter,
    name: str,
    params: Optional[Mapping[str, Union[int, str, datetime]]],
):
    """Convert an ``error.handled`` filter into an ``isHandled`` / ``notHandled`` condition.

    ``name`` and ``params`` are not used here. Values other than 1/0 (or an empty
    "has" value) raise InvalidSearchQuery.
    """
    value = search_filter.value.value

    # Treat has filter as equivalent to handled
    if search_filter.value.raw_value == "":
        if search_filter.operator == "!=":
            output = 1
        else:
            output = 0
        return [["isHandled", []], "=", output]

    # Null values and 1 are the same, and both indicate a handled error.
    if value in ("1", 1):
        return [["isHandled", []], "=", 1]
    if value in ("0", 0):
        return [["notHandled", []], "=", 1]

    raise InvalidSearchQuery("Invalid value for error.handled condition. Accepted values are 1, 0")
def team_key_transaction_filter(builder: QueryBuilder, search_filter: SearchFilter) -> WhereType:
    """Build a Condition on the team key transaction alias for a 1/0 (or "has") filter.

    An empty raw value compares the alias against 0, using ``!=`` when the filter
    operator is ``!=`` and ``=`` otherwise. Other values raise InvalidSearchQuery.
    """
    value = search_filter.value.value
    key_transaction_expr = builder.resolve_field_alias(constants.TEAM_KEY_TRANSACTION_ALIAS)

    if search_filter.value.raw_value == "":
        if search_filter.operator == "!=":
            has_op = Op.NEQ
        else:
            has_op = Op.EQ
        return Condition(key_transaction_expr, has_op, 0)

    if value in ("1", 1):
        return Condition(key_transaction_expr, Op.EQ, 1)
    if value in ("0", 0):
        return Condition(key_transaction_expr, Op.EQ, 0)

    raise InvalidSearchQuery(
        "Invalid value for key_transaction condition. Accepted values are 1, 0"
    )
def _error_unhandled_filter_converter(
    search_filter: SearchFilter,
    name: str,
    params: Optional[Mapping[str, Union[int, str, datetime]]],
):
    """Convert an ``error.unhandled`` filter into an ``isHandled`` / ``notHandled`` condition.

    ``name`` and ``params`` are not used here. Values other than 1/0 (or an empty
    "has" value) raise InvalidSearchQuery.
    """
    value = search_filter.value.value

    # This field is the inversion of error.handled, otherwise the logic is the same.
    if search_filter.value.raw_value == "":
        if search_filter.operator == "!=":
            output = 0
        else:
            output = 1
        return [["isHandled", []], "=", output]

    if value in ("1", 1):
        return [["notHandled", []], "=", 1]
    if value in ("0", 0):
        return [["isHandled", []], "=", 1]

    raise InvalidSearchQuery(
        "Invalid value for error.unhandled condition. Accepted values are 1, 0"
    )
def _team_key_transaction_filter_converter(
    search_filter: SearchFilter,
    name: str,
    params: Optional[Mapping[str, Union[int, str, datetime]]],
):
    """Build a legacy ``[expr, operator, value]`` condition for the team key transaction alias.

    The expression comes from the alias' ``get_field(params)``; ``name`` is not used.
    An empty raw value compares against 0 with the filter's ``!=`` / ``=`` operator.
    Values other than 1/0 raise InvalidSearchQuery.
    """
    value = search_filter.value.value
    key_transaction_expr = FIELD_ALIASES[TEAM_KEY_TRANSACTION_ALIAS].get_field(params)

    if search_filter.value.raw_value == "":
        if search_filter.operator == "!=":
            operator = "!="
        else:
            operator = "="
        return [key_transaction_expr, operator, 0]

    if value in ("1", 1):
        return [key_transaction_expr, "=", 1]
    if value in ("0", 0):
        return [key_transaction_expr, "=", 0]

    raise InvalidSearchQuery(
        "Invalid value for key_transaction condition. Accepted values are 1, 0"
    )
def _resolve_web_vital_function(
    self,
    args: Mapping[str, Union[str, Column, SelectType, int, float]],
    alias: str,
) -> SelectType:
    """Build the ``countIf`` expression counting a web vital's values with a given rating.

    Reads ``column``, ``metric_id`` and ``quality`` (lower-cased) from ``args``. Only the
    lcp/fcp/fp/fid/cls measurements are accepted; anything else raises InvalidSearchQuery.
    When the quality cannot be resolved by the indexer a constant zero is returned instead.
    """
    column = args["column"]
    metric_id = args["metric_id"]
    quality = args["quality"].lower()

    supported_columns = (
        "measurements.lcp",
        "measurements.fcp",
        "measurements.fp",
        "measurements.fid",
        "measurements.cls",
    )
    if column not in supported_columns:
        raise InvalidSearchQuery("count_web_vitals only supports measurements")

    measurement_rating = self.builder.resolve_column("measurement_rating")
    quality_id = indexer.resolve(quality)
    if quality_id is None:
        # This matches the type from doing `select toTypeName(count()) ...` from clickhouse
        return Function("toUInt64", [0], alias)

    value_column = Column("value")
    rating_matches = Function("equals", [measurement_rating, quality_id])
    metric_matches = Function("equals", [Column("metric_id"), metric_id])
    return Function(
        "countIf",
        [value_column, Function("and", [rating_matches, metric_matches])],
        alias,
    )
def get_v1_results(self, request, organization):
    """Run the legacy events-stats query and return the serialized time series response.

    An InvalidSearchQuery while building the snuba args is re-raised as ParseError;
    NoProjects yields an empty ``{"data": []}`` response.
    """
    try:
        snuba_args = self.get_snuba_query_args_legacy(request, organization)
    except InvalidSearchQuery as exc:
        raise ParseError(detail=str(exc))
    except NoProjects:
        return Response({"data": []})

    snuba_args = self.get_field(request, snuba_args)

    too_many_results = InvalidSearchQuery(
        "Your interval and date range would create too many results. "
        "Use a larger interval, or a smaller date range."
    )
    rollup = get_rollup_from_request(
        request,
        snuba_args,
        default_interval=None,
        error=too_many_results,
    )

    result = transform_aliases_and_query(
        aggregations=snuba_args.get("aggregations"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        orderby="time",
        groupby=["time"],
        rollup=rollup,
        referrer="api.organization-events-stats",
        limit=10000,
    )

    serializer = SnubaTSResultSerializer(organization, None, request.user)
    ts_result = snuba.SnubaTSResult(result, snuba_args["start"], snuba_args["end"], rollup)
    return Response(serializer.serialize(ts_result), status=200)
def resolve_params(self) -> None:
    """Keys included as url params take precedent if same key is included in search

    They are also considered safe and to have had access rules applied unlike conditions
    from the query string. Appends timestamp, project_id and environment conditions to
    ``self.where``; raises InvalidSearchQuery when start or end is missing.
    """
    params = self.params

    # start/end are required so that we can run a query in a reasonable amount of time
    if "start" not in params or "end" not in params:
        raise InvalidSearchQuery("Cannot query without a valid date range")
    start = params["start"]
    end = params["end"]

    # TODO: this validation should be done when we create the params dataclass instead
    assert isinstance(start, datetime) and isinstance(
        end, datetime
    ), "Both start and end params must be datetime objects"
    assert all(
        isinstance(project_id, int) for project_id in params.get("project_id", [])
    ), "All project id params must be ints"

    # The time window is half-open: start inclusive, end exclusive
    self.where.append(Condition(self.column("timestamp"), Op.GTE, start))
    self.where.append(Condition(self.column("timestamp"), Op.LT, end))

    if "project_id" in params:
        project_condition = Condition(self.column("project_id"), Op.IN, params["project_id"])
        self.where.append(project_condition)

    if "environment" in params:
        term = SearchFilter(SearchKey("environment"), "=", SearchValue(params["environment"]))
        condition = self._environment_filter_converter(term, "environment")
        if condition:
            self.where.append(condition)
def inbox_search(
    projects: Sequence[Project],
    environments: Optional[Sequence[Environment]] = None,
    limit: int = 100,
    cursor: Optional[Cursor] = None,
    count_hits: bool = False,
    search_filters: Optional[Sequence[SearchFilter]] = None,
    date_from: Optional[datetime] = None,
    date_to: Optional[datetime] = None,
    max_hits: Optional[int] = None,
) -> CursorResult:
    """Search unresolved groups that are in the inbox, paginated by inbox ``date_added``.

    The time window ends at the earliest of ``date_to`` / the ``date <`` filter (defaulting
    to now + ALLOWED_FUTURE_DELTA) and starts no earlier than 7 days ago. An empty window
    returns an empty result.

    Raises InvalidSearchQuery when a filter key is not in ``allowed_inbox_search_terms``,
    when there is no ``for_review`` filter, or when status is not restricted to unresolved.
    """
    now: datetime = timezone.now()
    end: Optional[datetime] = None
    end_params: List[datetime] = [
        _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)
    end = end if end else now + ALLOWED_FUTURE_DELTA

    # We only want to search back a week at most, since that's the oldest inbox rows
    # can be.
    earliest_date = now - timedelta(days=7)
    start_params = [date_from, earliest_date, get_search_filter(search_filters, "date", ">")]
    start = max(_f for _f in start_params if _f)
    end = max([earliest_date, end])

    if start >= end:
        return Paginator(Group.objects.none()).get_result()

    # Make sure search terms are valid. `search_filters` defaults to None, so guard the
    # iteration instead of failing with a TypeError.
    invalid_search_terms = [
        str(sf) for sf in (search_filters or []) if sf.key.name not in allowed_inbox_search_terms
    ]
    if invalid_search_terms:
        raise InvalidSearchQuery(f"Invalid search terms for 'inbox' search: {invalid_search_terms}")

    # Make sure this is an inbox search
    if not get_search_filter(search_filters, "for_review", "="):
        raise InvalidSearchQuery("Sort key 'inbox' only supported for inbox search")

    if get_search_filter(
        search_filters, "status", "="
    ) != GroupStatus.UNRESOLVED and get_search_filter(search_filters, "status", "IN") != [
        GroupStatus.UNRESOLVED
    ]:
        raise InvalidSearchQuery("Inbox search only works for 'unresolved' status")

    # We just filter on `GroupInbox.date_added` here, and don't filter by date
    # on the group. This keeps the query simpler and faster in some edge cases,
    # and date_added is a good enough proxy when we're using this sort.
    qs = GroupInbox.objects.filter(
        date_added__gte=start,
        date_added__lte=end,
        project__in=projects,
    )

    if environments is not None:
        environment_ids: List[int] = [environment.id for environment in environments]
        qs = qs.filter(
            group_id__in=GroupEnvironment.objects.filter(environment_id__in=environment_ids)
            .values_list("group_id", flat=True)
            .distinct()
        )

    owner_search = get_search_filter(search_filters, "assigned_or_suggested", "IN")
    if owner_search:
        qs = qs.filter(
            assigned_or_suggested_filter(owner_search, projects, field_filter="group_id")
        )

    paginator = DateTimePaginator(qs.order_by("date_added"), "-date_added")
    results = paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    # We want to return groups from the endpoint, but have the cursor be related to the
    # GroupInbox rows. So we paginate on the GroupInbox results queryset, then fetch
    # the group_ids out and use them to get the actual groups.
    group_qs = Group.objects.filter(
        id__in=[r.group_id for r in results.results],
        project__in=projects,
        status=GroupStatus.UNRESOLVED,
    )
    groups: Mapping[int, Group] = {g.id: g for g in group_qs}
    # Inbox rows whose group did not come back from the query above are dropped
    results.results = [groups[r.group_id] for r in results.results if r.group_id in groups]
    return results
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
    max_hits=None,
):
    """Run a group search, using Postgres alone when possible and Snuba otherwise.

    Flow, as implemented below:

    1. Derive the ``[start, end)`` window from ``date_from`` / ``date_to``, the ``date``
       search filters and the retention date; return ``self.empty_result`` when the
       window is empty or entirely outside retention.
    2. Answer directly from ``group_queryset`` for the postgres-only ``date`` sort and
       for the ``inbox`` sort; an ``inbox`` sort that cannot be answered that way raises
       InvalidSearchQuery.
    3. Otherwise pre-filter candidate group ids in Postgres (up to the
       ``snuba.search.max-pre-snuba-candidates`` option) and query Snuba in growing
       chunks, post-filtering against ``group_queryset`` when there were too many
       candidates to pass down.

    Returns a paginator result whose ``results`` are Group objects.
    """
    now = timezone.now()
    end = None
    # The effective end is the earliest of date_to and the `date <` search filter
    end_params = [
        _f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f
    ]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # NOTE(review): the statements below are laid out as nested under `if not end:`
        # because the comment refers to a window that ends with "now" - confirm against
        # the original file layout.
        metrics.incr("snuba.search.postgres_only")

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (cursor is None and sort_by == "date" and
                # This handles tags and date parameters for search filters.
                not [
                    sf for sf in search_filters
                    if sf.key.name not in self.postgres_only_fields.union(["date"])
                ]):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options)
            # When its a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max([
        _f for _f in [retention_window_start, now - timedelta(days=90)] if _f
    ])
    # The effective start is the latest of date_from, retention and the `date >` filter
    start_params = [
        date_from, retention_date, get_search_filter(search_filters, "date", ">")
    ]
    start = max([_f for _f in start_params if _f])
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    # This search is specific to Inbox. If we're using inbox sort and only querying
    # postgres then we can use this sort method. Otherwise if we need to go to Snuba,
    # fail.
    if (sort_by == "inbox" and get_search_filter(search_filters, "for_review", "=")
            # This handles tags and date parameters for search filters.
            and not [
                sf for sf in search_filters
                if sf.key.name not in self.postgres_only_fields.union(["date"])
            ]):
        # We just filter on `GroupInbox.date_added` here, and don't filter by date
        # on the group. This keeps the query simpler and faster in some edge cases,
        # and date_added is a good enough proxy when we're using this sort.
        group_queryset = group_queryset.filter(
            groupinbox__date_added__gte=start,
            groupinbox__date_added__lte=end,
        )
        group_queryset = group_queryset.extra(select={
            "inbox_date": "sentry_groupinbox.date_added"
        }, ).order_by("-inbox_date")
        paginator = DateTimePaginator(group_queryset, "-inbox_date", **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    if sort_by == "inbox":
        raise InvalidSearchQuery(
            f"Sort key '{sort_by}' only supported for inbox search")

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

    with sentry_sdk.start_span(op="snuba_group_query") as span:
        # Fetch one more than the limit so that "too many candidates" can be detected
        group_ids = list(
            group_queryset.values_list("id", flat=True)[:max_candidates + 1])
        span.set_data("Max Candidates", max_candidates)
        span.set_data("Result Size", len(group_ids))
    metrics.timing("snuba.search.num_candidates", len(group_ids))

    too_many_candidates = False
    if not group_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return self.empty_result
    elif len(group_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        group_ids = []

    sort_field = self.sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = self.calculate_hits(
        group_ids,
        too_many_candidates,
        sort_field,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        start,
        end,
    )
    if count_hits and hits == 0:
        return self.empty_result

    paginator_results = self.empty_result
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have group_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(group_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = self.snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            group_ids=group_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if group_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the group_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups], reverse=True,
            **paginator_options).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)

        if group_ids or len(
                paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    # Swap the group ids for Group objects, dropping ids that in_bulk did not return
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups[k] for k in paginator_results.results if k in groups
    ]

    return paginator_results
def format_search_filter(term, params):
    """Translate one parsed search filter into Snuba query pieces.

    Aliased keys are rewritten before conversion:

    * ``PROJECT_ALIAS`` / ``PROJECT_NAME_ALIAS``: slugs are looked up among the
      projects listed in ``params["project_id"]`` and replaced by a
      ``project_id`` filter on the matching ids.
    * ``ISSUE_ID_ALIAS`` with a non-empty value: no condition is produced, the
      ids are returned as ``group_ids`` instead.
    * ``ISSUE_ALIAS``: qualified short ids are resolved through
      ``Group.objects.by_qualified_short_id_bulk`` and replaced by an
      ``issue.id`` filter; empty values and ``"unknown"`` become ``""``.
    * ``RELEASE_ALIAS`` containing ``"latest"``: every value is passed through
      ``parse_release`` first.

    Any other key goes to ``convert_search_filter_to_snuba_query`` unchanged.

    :param term: the filter to convert; ``key.name``, ``operator``,
        ``value.value`` and ``is_in_filter`` are read.
    :param params: request parameters, may be ``None`` or empty.
        ``project_id``, ``organization_id`` and ``environment_objects`` are
        read when present.
    :returns: tuple ``(conditions, projects_to_filter, group_ids)``.
        ``projects_to_filter`` is only filled for an equality project filter;
        ``group_ids`` stays ``None`` unless the term is a non-empty
        ``ISSUE_ID_ALIAS`` filter.
    :raises InvalidSearchQuery: for an empty ``project`` has-filter, for
        unknown or unselected project slugs on an equality filter, and for
        issue short ids that cannot be resolved.
    """
    projects_to_filter = []  # Used to avoid doing multiple conditions on project ID
    conditions = []
    group_ids = None
    name = term.key.name
    value = term.value.value
    if name in (PROJECT_ALIAS, PROJECT_NAME_ALIAS):
        if term.operator == "=" and value == "":
            raise InvalidSearchQuery(
                "Invalid query for 'has' search: 'project' cannot be empty.")
        slugs = to_list(value)
        # The other branches already treat `params` as possibly falsy; do the
        # same here so a missing params dict means "no selected projects"
        # instead of an AttributeError on `None.get`.
        selected_project_ids = params.get("project_id", []) if params else []
        projects = {
            p.slug: p.id
            for p in Project.objects.filter(
                id__in=selected_project_ids, slug__in=slugs)
        }
        missing = [slug for slug in slugs if slug not in projects]
        if missing and term.operator in EQUALITY_OPERATORS:
            raise InvalidSearchQuery(
                f"Invalid query. Project(s) {', '.join(missing)} do not exist or are not actively selected."
            )
        project_ids = sorted(projects.values())
        if project_ids:
            # Create a new search filter with the correct values
            term = SearchFilter(
                SearchKey("project_id"),
                term.operator,
                SearchValue(
                    project_ids if term.is_in_filter else project_ids[0]),
            )
            converted_filter = convert_search_filter_to_snuba_query(term)
            if converted_filter:
                if term.operator in EQUALITY_OPERATORS:
                    projects_to_filter = project_ids
                conditions.append(converted_filter)
    elif name == ISSUE_ID_ALIAS and value != "":
        # A blank term value means that this is a has filter
        group_ids = to_list(value)
    elif name == ISSUE_ALIAS:
        operator = term.operator
        value = to_list(value)
        # `unknown` is a special value for when there is no issue associated with the event
        group_short_ids = [v for v in value if v and v != "unknown"]
        filter_values = ["" for v in value if not v or v == "unknown"]
        if group_short_ids and params and "organization_id" in params:
            try:
                groups = Group.objects.by_qualified_short_id_bulk(
                    params["organization_id"],
                    group_short_ids,
                )
            except Exception as exc:
                # Keep the lookup failure attached as the cause for debugging.
                raise InvalidSearchQuery(
                    f"Invalid value '{group_short_ids}' for 'issue:' filter"
                ) from exc
            else:
                filter_values.extend(sorted([g.id for g in groups]))
        if not term.is_in_filter and not filter_values:
            # The short id could not be resolved (no organization to look it
            # up in); report it as a bad query rather than letting
            # `filter_values[0]` fail with an IndexError.
            raise InvalidSearchQuery(
                f"Invalid value '{group_short_ids}' for 'issue:' filter")
        term = SearchFilter(
            SearchKey("issue.id"),
            operator,
            SearchValue(
                filter_values if term.is_in_filter else filter_values[0]),
        )
        converted_filter = convert_search_filter_to_snuba_query(term)
        conditions.append(converted_filter)
    elif (name == RELEASE_ALIAS and params
          and (value == "latest"
               or term.is_in_filter and any(v == "latest" for v in value))):
        value = [
            parse_release(
                v,
                params["project_id"],
                params.get("environment_objects"),
                params.get("organization_id"),
            ) for v in to_list(value)
        ]
        converted_filter = convert_search_filter_to_snuba_query(
            SearchFilter(
                term.key,
                term.operator,
                SearchValue(value if term.is_in_filter else value[0]),
            ))
        if converted_filter:
            conditions.append(converted_filter)
    else:
        converted_filter = convert_search_filter_to_snuba_query(term,
                                                                params=params)
        if converted_filter:
            conditions.append(converted_filter)
    return conditions, projects_to_filter, group_ids
def get_filter(query=None, params=None):
    """
    Build an eventstore filter from the user's search text and the URL params.

    The query string is parsed and converted into ``conditions`` / ``having``
    entries, then values from ``params`` are layered on top.

    :param query: raw search string, or ``None`` for no search terms.
    :param params: URL params dict, or ``None``.
    :returns: ``eventstore.Filter`` built from the collected keyword arguments.
    :raises InvalidSearchQuery: when the query string fails to parse.
    """
    # NOTE: this function assumes project permissions check already happened
    if query is None:
        parsed_terms = []
    else:
        try:
            parsed_terms = parse_search_query(query,
                                              allow_boolean=True,
                                              params=params)
        except ParseError as e:
            raise InvalidSearchQuery(
                f"Parse error: {e.expr.name} (column {e.column():d})")

    filter_kwargs = {
        "start": None,
        "end": None,
        "conditions": [],
        "having": [],
        "user_id": None,
        "organization_id": None,
        "project_ids": [],
        "group_ids": [],
        "condition_aggregates": [],
        "aliases": {} if params is None else params.get("aliases", {}),
    }

    uses_boolean_syntax = any(
        isinstance(parsed, ParenExpression) or SearchBoolean.is_operator(parsed)
        for parsed in parsed_terms)

    if uses_boolean_syntax:
        # Boolean queries are converted as one tree, then the top-level ANDs
        # are flattened back into a list of conditions.
        (
            condition_tree,
            having_tree,
            tree_projects,
            tree_group_ids,
        ) = convert_search_boolean_to_snuba_query(parsed_terms, params)

        if condition_tree:
            filter_kwargs["conditions"].extend(
                convert_function_to_condition(node)
                for node in flatten_condition_tree(condition_tree, SNUBA_AND))
        if having_tree:
            filter_kwargs["condition_aggregates"] = [
                parsed.key.name for parsed in parsed_terms
                if isinstance(parsed, AggregateFilter)
            ]
            filter_kwargs["having"].extend(
                convert_function_to_condition(node)
                for node in flatten_condition_tree(having_tree, SNUBA_AND))
        projects_to_filter = list(set(tree_projects)) if tree_projects else []
        if tree_group_ids is not None:
            filter_kwargs["group_ids"].extend(set(tree_group_ids))
    else:
        unique_projects = set()
        for parsed in parsed_terms:
            if isinstance(parsed, SearchFilter):
                term_conditions, term_projects, term_group_ids = format_search_filter(
                    parsed, params)
                if term_conditions:
                    filter_kwargs["conditions"].extend(term_conditions)
                if term_projects:
                    unique_projects.update(term_projects)
                if term_group_ids is not None:
                    filter_kwargs["group_ids"].extend(term_group_ids)
            elif isinstance(parsed, AggregateFilter):
                aggregate_condition = convert_aggregate_filter_to_snuba_query(
                    parsed, params)
                filter_kwargs["condition_aggregates"].append(parsed.key.name)
                if aggregate_condition:
                    filter_kwargs["having"].append(aggregate_condition)
        projects_to_filter = list(unique_projects)

    # Keys included as url params take precedent if same key is included in search
    # They are also considered safe and to have had access rules applied unlike conditions
    # from the query string.
    if params:
        filter_kwargs["start"] = params.get("start", None)
        filter_kwargs["end"] = params.get("end", None)
        for passthrough_key in ("user_id", "organization_id"):
            if passthrough_key in params:
                filter_kwargs[passthrough_key] = params[passthrough_key]
        # OrganizationEndpoint.get_filter() uses project_id, but eventstore.Filter uses project_ids
        if "project_id" in params:
            filter_kwargs["project_ids"] = projects_to_filter or params["project_id"]
        if "environment" in params:
            environment_term = SearchFilter(
                SearchKey("environment"), "=",
                SearchValue(params["environment"]))
            filter_kwargs["conditions"].append(
                convert_search_filter_to_snuba_query(environment_term))
        if "group_ids" in params:
            filter_kwargs["group_ids"] = to_list(params["group_ids"])
        # Deprecated alias, use `group_ids` instead
        if ISSUE_ID_ALIAS in params:
            filter_kwargs["group_ids"] = to_list(params["issue.id"])

    return eventstore.Filter(**filter_kwargs)
def _join_boolean_sides(operator, lhs, rhs):
    """Combine the two sides of a boolean split, dropping empty sides."""
    # `filter()` returns a lazy iterator on Python 3, which is always truthy
    # and has no `len()`, so the sides are materialised into a list.
    sides = [side for side in (lhs, rhs) if side]
    if not sides:
        return None
    if len(sides) == 1:
        return sides[0]
    return [operator, sides]


def convert_search_boolean_to_snuba_query(terms, params=None):
    """Convert a flat list of parsed boolean search terms into condition trees.

    The list is validated, explicit ``AND`` operators are dropped, and the
    remaining terms are split recursively into a binary tree: on the first
    ``OR`` if there is one, otherwise the first term is ANDed with the rest.

    :param terms: parsed terms mixed with ``SearchBoolean`` operators.
    :param params: passed through to ``convert_snuba_condition_to_function``.
    :returns: tuple ``(condition, having, projects_to_filter, group_ids)``.
        ``condition`` and ``having`` are each ``None``, the single non-empty
        side, or ``[operator, [lhs, rhs]]``; the two lists are the left side's
        lists extended with the right side's.
    :raises InvalidSearchQuery: when two operators are adjacent, when the
        query starts or ends with an operator, or when an ``OR`` joins
        aggregate filters with normal filters.
    """
    if len(terms) == 1:
        return convert_snuba_condition_to_function(terms[0], params)

    # Filter out any ANDs since we can assume anything without an OR is an AND. Also do some
    # basic sanitization of the query: can't have two operators next to each other, and can't
    # start or end a query with an operator.
    prev = None
    new_terms = []
    for term in terms:
        if prev:
            if SearchBoolean.is_operator(prev) and SearchBoolean.is_operator(
                    term):
                raise InvalidSearchQuery(
                    f"Missing condition in between two condition operators: '{prev} {term}'"
                )
        else:
            if SearchBoolean.is_operator(term):
                raise InvalidSearchQuery(
                    f"Condition is missing on the left side of '{term}' operator"
                )

        if term != SearchBoolean.BOOLEAN_AND:
            new_terms.append(term)
        prev = term
    # NOTE(review): an empty `terms` list would reach this line with `term`
    # unbound; callers are assumed to always pass at least one term.
    if SearchBoolean.is_operator(term):
        raise InvalidSearchQuery(
            f"Condition is missing on the right side of '{term}' operator")
    terms = new_terms

    # We put precedence on AND, which sort of counter-intuitively means we have to split the query
    # on ORs first, so the ANDs are grouped together. Search through the query for ORs and split the
    # query on each OR.
    # We want to maintain a binary tree, so split the terms on the first OR we can find and recurse on
    # the two sides. If there is no OR, split the first element out to AND
    try:
        index = terms.index(SearchBoolean.BOOLEAN_OR)
    except ValueError:
        # `list.index` signals "no OR present" with ValueError.
        lhs, rhs = terms[:1], terms[1:]
        operator = SNUBA_AND
    else:
        lhs, rhs = terms[:index], terms[index + 1:]
        operator = SNUBA_OR

    (
        lhs_condition,
        lhs_having,
        projects_to_filter,
        group_ids,
    ) = convert_search_boolean_to_snuba_query(lhs, params)
    (
        rhs_condition,
        rhs_having,
        rhs_projects_to_filter,
        rhs_group_ids,
    ) = convert_search_boolean_to_snuba_query(rhs, params)

    projects_to_filter.extend(rhs_projects_to_filter)
    group_ids.extend(rhs_group_ids)

    if operator == SNUBA_OR and (lhs_condition
                                 or rhs_condition) and (lhs_having
                                                        or rhs_having):
        raise InvalidSearchQuery(
            "Having an OR between aggregate filters and normal filters is invalid."
        )

    condition = _join_boolean_sides(operator, lhs_condition, rhs_condition)
    having = _join_boolean_sides(operator, lhs_having, rhs_having)

    return condition, having, projects_to_filter, group_ids
def convert_search_filter_to_snuba_query(search_filter, key=None, params=None):
    """Convert a single search filter into a legacy Snuba condition.

    The filter's name selects one of several special cases (environment,
    message, array fields, transaction.status, issue.id, user display,
    error handled/unhandled, key transaction); anything else falls through to
    the generic conversion at the bottom. The branches are order-dependent:
    the ``ARRAY_FIELDS`` checks in particular appear at three different
    priorities.

    :param search_filter: the filter to convert; ``key``, ``operator``,
        ``value`` and ``is_in_filter`` are read.
    :param key: when not ``None``, used as the field name instead of
        ``search_filter.key.name``.
    :param params: passed to ``FIELD_ALIASES[...].get_expression``.
    :returns: ``None`` for names in ``NO_CONVERSION_FIELDS``; otherwise a
        single ``[lhs, operator, rhs]`` condition, or a list of such
        conditions (environment filters, and the null-check pair in the
        generic branch).
    :raises InvalidSearchQuery: for a wildcard on ``id`` and for values other
        than 1/0 on the handled, unhandled and key-transaction filters.
    """
    name = search_filter.key.name if key is None else key
    value = search_filter.value.value
    # We want to use group_id elsewhere so shouldn't be removed from the dataset
    # but if a user has a tag with the same name we want to make sure that works
    if name in {"group_id"}:
        name = f"tags[{name}]"
    if name in NO_CONVERSION_FIELDS:
        return
    elif name == "id" and search_filter.value.is_wildcard():
        raise InvalidSearchQuery(
            "Wildcard conditions are not permitted on `id` field.")
    elif name == "environment":
        # conditions added to env_conditions are OR'd
        env_conditions = []
        # Normalise a scalar or a list/tuple into a set of distinct values.
        values = set(value if isinstance(value, (list, tuple)) else [value])
        # the "no environment" environment is null in snuba
        if "" in values:
            values.remove("")
            operator = "IS NULL" if search_filter.operator == "=" else "IS NOT NULL"
            env_conditions.append(["environment", operator, None])
        if len(values) == 1:
            operator = "=" if search_filter.operator in EQUALITY_OPERATORS else "!="
            env_conditions.append(["environment", operator, values.pop()])
        elif values:
            # More than one value left: `values` is passed on as a set.
            operator = "IN" if search_filter.operator in EQUALITY_OPERATORS else "NOT IN"
            env_conditions.append(["environment", operator, values])
        return env_conditions
    elif name == "message":
        if search_filter.value.is_wildcard():
            # XXX: We don't want the '^$' values at the beginning and end of
            # the regex since we want to find the pattern anywhere in the
            # message. Strip off here
            value = search_filter.value.value[1:-1]
            return [["match", ["message", f"'(?i){value}'"]],
                    search_filter.operator, 1]
        elif value == "":
            # Empty value: compare the message against the empty string.
            operator = "=" if search_filter.operator == "=" else "!="
            return [["equals", ["message", f"{value}"]], operator, 1]
        else:
            # https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/#position-haystack-needle
            # positionCaseInsensitive returns 0 if not found and an index of 1 or more if found
            # so we should flip the operator here
            operator = "!=" if search_filter.operator in EQUALITY_OPERATORS else "="
            if search_filter.is_in_filter:
                # XXX: This `toString` usage is unnecessary, but we need it in place to
                # trick the legacy Snuba language into not treating `message` as a
                # function. Once we switch over to snql it can be removed.
                return [
                    [
                        "multiSearchFirstPositionCaseInsensitive",
                        [["toString", ["message"]],
                         ["array", [f"'{v}'" for v in value]]],
                    ],
                    operator,
                    0,
                ]
            # make message search case insensitive
            return [["positionCaseInsensitive", ["message", f"'{value}'"]],
                    operator, 0]
    elif name in ARRAY_FIELDS and search_filter.value.is_wildcard():
        # Escape and convert meta characters for LIKE expressions.
        # Literal `%` and `_` are escaped first, then `*` becomes `%`.
        raw_value = search_filter.value.raw_value
        like_value = raw_value.replace("%", "\\%").replace("_", "\\_").replace("*", "%")
        operator = "LIKE" if search_filter.operator == "=" else "NOT LIKE"
        return [name, operator, like_value]
    elif name in ARRAY_FIELDS and search_filter.is_in_filter:
        # `hasAny(...)` is compared with 1, so IN maps to "=" and any other
        # operator to "!=".
        operator = "=" if search_filter.operator == "IN" else "!="
        # XXX: This `arrayConcat` usage is unnecessary, but we need it in place to
        # trick the legacy Snuba language into not treating `name` as a
        # function. Once we switch over to snql it can be removed.
        return [
            [
                "hasAny",
                [["arrayConcat", [name]], ["array", [f"'{v}'" for v in value]]]
            ],
            operator,
            1,
        ]
    elif name == "transaction.status":
        # Handle "has" queries
        if search_filter.value.raw_value == "":
            return [["isNull", [name]], search_filter.operator, 1]
        # Raw status values are translated one by one for IN filters,
        # otherwise as a single value.
        if search_filter.is_in_filter:
            internal_value = [
                translate_transaction_status(val)
                for val in search_filter.value.raw_value
            ]
        else:
            internal_value = translate_transaction_status(
                search_filter.value.raw_value)
        return [name, search_filter.operator, internal_value]
    elif name == "issue.id":
        # Handle "has" queries
        if (search_filter.value.raw_value == "" or search_filter.is_in_filter
                and [v for v in value if not v]):
            # The state of having no issues is represented differently on transactions vs
            # other events. On the transactions table, it is represented by 0 whereas it is
            # represented by NULL everywhere else. We use coalesce here so we can treat this
            # consistently
            name = ["coalesce", [name, 0]]
            if search_filter.is_in_filter:
                # Replace every empty entry with 0, keep the real ids.
                value = [v if v else 0 for v in value]
            else:
                value = 0
        # Skip isNull check on group_id value as we want to
        # allow snuba's prewhere optimizer to find this condition.
        return [name, search_filter.operator, value]
    elif name == USER_DISPLAY_ALIAS:
        user_display_expr = FIELD_ALIASES[USER_DISPLAY_ALIAS].get_expression(
            params)
        # Handle 'has' condition
        if search_filter.value.raw_value == "":
            return [["isNull", [user_display_expr]], search_filter.operator, 1]
        if search_filter.value.is_wildcard():
            return [
                ["match", [user_display_expr, f"'(?i){value}'"]],
                search_filter.operator,
                1,
            ]
        return [user_display_expr, search_filter.operator, value]
    elif name == ERROR_UNHANDLED_ALIAS:
        # This field is the inversion of error.handled, otherwise the logic is the same.
        if search_filter.value.raw_value == "":
            output = 0 if search_filter.operator == "!=" else 1
            return [["isHandled", []], "=", output]
        if value in ("1", 1):
            return [["notHandled", []], "=", 1]
        if value in ("0", 0):
            return [["isHandled", []], "=", 1]
        raise InvalidSearchQuery(
            "Invalid value for error.unhandled condition. Accepted values are 1, 0"
        )
    elif name == "error.handled":
        # Treat has filter as equivalent to handled
        if search_filter.value.raw_value == "":
            output = 1 if search_filter.operator == "!=" else 0
            return [["isHandled", []], "=", output]
        # Null values and 1 are the same, and both indicate a handled error.
        if value in ("1", 1):
            return [["isHandled", []], "=", 1]
        if value in (
                "0",
                0,
        ):
            return [["notHandled", []], "=", 1]
        raise InvalidSearchQuery(
            "Invalid value for error.handled condition. Accepted values are 1, 0"
        )
    elif name == KEY_TRANSACTION_ALIAS:
        key_transaction_expr = FIELD_ALIASES[
            KEY_TRANSACTION_ALIAS].get_expression(params)
        # A "has" query compares the expression with 0, keeping "!=" when the
        # filter used it and "=" otherwise.
        if search_filter.value.raw_value == "":
            operator = "!=" if search_filter.operator == "!=" else "="
            return [key_transaction_expr, operator, 0]
        if value in ("1", 1):
            return [key_transaction_expr, "=", 1]
        if value in ("0", 0):
            return [key_transaction_expr, "=", 0]
        raise InvalidSearchQuery(
            "Invalid value for key_transaction condition. Accepted values are 1, 0"
        )
    elif name in ARRAY_FIELDS and search_filter.value.raw_value == "":
        # "has" query on an array field: `notEmpty(...)` is compared with 1
        # for "!=" and with 0 for any other operator.
        return [["notEmpty", [name]], "=",
                1 if search_filter.operator == "!=" else 0]
    else:
        # timestamp{,.to_{hour,day}} need a datetime string
        # last_seen needs an integer
        if isinstance(value, datetime) and name not in {
                "timestamp",
                "timestamp.to_hour",
                "timestamp.to_day",
        }:
            value = int(to_timestamp(value)) * 1000
        # most field aliases are handled above but timestamp.to_{hour,day} are
        # handled here
        if name in FIELD_ALIASES:
            name = FIELD_ALIASES[name].get_expression(params)
        # Tags are never null, but promoted tags are columns and so can be null.
        # To handle both cases, use `ifNull` to convert to an empty string and
        # compare so we need to check for empty values.
        if search_filter.key.is_tag:
            name = ["ifNull", [name, "''"]]
        # Handle checks for existence
        if search_filter.operator in (
                "=", "!=") and search_filter.value.value == "":
            if search_filter.key.is_tag:
                return [name, search_filter.operator, value]
            else:
                # If not a tag, we can just check that the column is null.
                return [["isNull", [name]], search_filter.operator, 1]
        is_null_condition = None
        # TODO(wmak): Skip this for all non-nullable keys not just event.type
        if (search_filter.operator in ("!=", "NOT IN")
                and not search_filter.key.is_tag and name != "event.type"):
            # Handle null columns on inequality comparisons. Any comparison
            # between a value and a null will result to null, so we need to
            # explicitly check for whether the condition is null, and OR it
            # together with the inequality check.
            # We don't need to apply this for tags, since if they don't exist
            # they'll always be an empty string.
            is_null_condition = [["isNull", [name]], "=", 1]
        if search_filter.value.is_wildcard():
            condition = [["match", [name, f"'(?i){value}'"]],
                         search_filter.operator, 1]
        else:
            condition = [name, search_filter.operator, value]
        # We only want to return as a list if we have the check for null
        # present. Returning as a list causes these conditions to be ORed
        # together. Otherwise just return the raw condition, so that it can be
        # used correctly in aggregates.
        if is_null_condition:
            return [is_null_condition, condition]
        else:
            return condition
def get_event_stats_data(
    self,
    request: Request,
    organization: Organization,
    get_event_stats: Callable[
        [Sequence[str], str, Dict[str, str], int, bool, Optional[timedelta]],
        SnubaTSResult
    ],
    top_events: int = 0,
    query_column: str = "count()",
    params: Optional[Dict[str, Any]] = None,
    query: Optional[str] = None,
    allow_partial_buckets: bool = False,
    zerofill_results: bool = True,
    comparison_delta: Optional[timedelta] = None,
) -> Dict[str, Any]:
    """Run an events-stats query and serialize the resulting time series.

    The y-axis columns come from the request's ``yAxis`` values (defaulting to
    ``query_column``), legacy aliases are mapped to their current names, and
    ``get_event_stats`` is called with the resolved columns and rollup.

    :param get_event_stats: callback that performs the actual query.
    :param top_events: when > 0 and the callback returns a dict, each entry is
        serialized separately under its key.
    :param params: snuba params; fetched via ``self.get_snuba_params`` when
        ``None``.
    :param query: search string; read from the request when ``None``.
    :param comparison_delta: when given, the comparison window must fall inside
        the organization's retention.
    :returns: the serialized result, or ``{"data": []}`` when
        ``get_snuba_params`` raises ``NoProjects``.
    :raises ValidationError: when the comparison period starts before the
        retention window.
    """
    with self.handle_query_errors():
        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_query_creation"):
            y_axes = request.GET.getlist("yAxis", [query_column])
            if query is None:
                query = request.GET.get("query")
            if params is None:
                try:
                    # events-stats is still used by events v1 which doesn't require global views
                    params = self.get_snuba_params(request,
                                                   organization,
                                                   check_global_views=False)
                except NoProjects:
                    return {"data": []}

            try:
                rollup = get_rollup_from_request(
                    request,
                    params,
                    default_interval=None,
                    error=InvalidSearchQuery(),
                    top_events=top_events,
                )
            except InvalidSearchQuery:
                # If the user sends an invalid interval, use the default instead
                sentry_sdk.set_tag("user.invalid_interval",
                                   request.GET.get("interval"))
                fallback_period = parse_stats_period(
                    get_interval_from_range(params["end"] - params["start"],
                                            False))
                if fallback_period is None:
                    rollup = 3600
                else:
                    rollup = int(fallback_period.total_seconds())

            if comparison_delta is not None:
                retention_days = quotas.get_event_retention(
                    organization=organization)
                comparison_start = params["start"] - comparison_delta
                if retention_days:
                    retention_cutoff = timezone.now() - timedelta(
                        days=retention_days)
                    if comparison_start < retention_cutoff:
                        raise ValidationError(
                            "Comparison period is outside your retention window"
                        )

            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            # We need these rollup columns to generate correct events-stats results
            legacy_aliases = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "epm()": "epm(%d)" % rollup,
                "eps()": "eps(%d)" % rollup,
                "tpm()": "tpm(%d)" % rollup,
                "tps()": "tps(%d)" % rollup,
            }
            resolved_columns = [
                legacy_aliases.get(axis, axis) for axis in y_axes
            ]

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_query"):
            result = get_event_stats(resolved_columns, query, params, rollup,
                                     zerofill_results, comparison_delta)

        serializer = SnubaTSResultSerializer(organization, None, request.user)

        def serialize_all_axes(ts_result):
            # One series per requested y-axis.
            return self.serialize_multiple_axis(
                serializer,
                ts_result,
                y_axes,
                resolved_columns,
                allow_partial_buckets,
                zerofill_results=zerofill_results,
            )

        def serialize_top_event(ts_result):
            # Need to get function alias if count is a field, but not the axis
            return serializer.serialize(
                ts_result,
                column=resolve_axis_column(resolved_columns[0]),
                allow_partial_buckets=allow_partial_buckets,
                zerofill_results=zerofill_results,
            )

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_serialization"):
            # When the request is for top_events, result can be a SnubaTSResult in the event that
            # there were no top events found. In this case, result contains a zerofilled series
            # that acts as a placeholder.
            has_many_axes = len(resolved_columns) > 1
            if top_events > 0 and isinstance(result, dict):
                serialize_one = serialize_all_axes if has_many_axes else serialize_top_event
                serialized_result = {
                    key: serialize_one(event_result)
                    for key, event_result in result.items()
                }
            elif has_many_axes:
                serialized_result = serialize_all_axes(result)
            else:
                extra_columns = ["comparisonCount"] if comparison_delta else None
                serialized_result = serializer.serialize(
                    result,
                    resolve_axis_column(resolved_columns[0]),
                    allow_partial_buckets=allow_partial_buckets,
                    zerofill_results=zerofill_results,
                    extra_columns=extra_columns,
                )

        return serialized_result
def get_event_stats_data(
    self,
    request,
    organization,
    get_event_stats,
    top_events=0,
    query_column="count()",
    params=None,
    query=None,
    allow_partial_buckets=False,
):
    """Run an events-stats query and serialize the resulting time series.

    The y-axis columns come from the request's ``yAxis`` values (defaulting to
    ``query_column``), legacy aliases are mapped to their current names, and
    ``get_event_stats`` is called with the resolved columns and rollup.

    :param get_event_stats: callback invoked as
        ``get_event_stats(query_columns, query, params, rollup)``.
    :param top_events: when > 0 and the callback returns a dict, each entry is
        serialized separately under its key.
    :param params: snuba params; fetched via ``self.get_snuba_params`` when
        ``None``.
    :param query: search string; read from the request when ``None``.
    :returns: the serialized result, or ``{"data": []}`` when
        ``get_snuba_params`` raises ``NoProjects``.
    """
    with self.handle_query_errors():
        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_query_creation"):
            y_axes = request.GET.getlist("yAxis", [query_column])
            if query is None:
                query = request.GET.get("query")
            if params is None:
                try:
                    # events-stats is still used by events v1 which doesn't require global views
                    params = self.get_snuba_params(request,
                                                   organization,
                                                   check_global_views=False)
                except NoProjects:
                    return {"data": []}

            too_many_results = InvalidSearchQuery(
                "Your interval and date range would create too many results. "
                "Use a larger interval, or a smaller date range.")
            rollup = get_rollup_from_request(
                request,
                params,
                default_interval=None,
                error=too_many_results,
                top_events=top_events,
            )

            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            # We need these rollup columns to generate correct events-stats results
            legacy_aliases = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "epm()": "epm(%d)" % rollup,
                "eps()": "eps(%d)" % rollup,
                "tpm()": "tpm(%d)" % rollup,
                "tps()": "tps(%d)" % rollup,
            }
            resolved_columns = [
                legacy_aliases.get(axis, axis) for axis in y_axes
            ]

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_query"):
            result = get_event_stats(resolved_columns, query, params, rollup)

        serializer = SnubaTSResultSerializer(organization, None, request.user)

        def serialize_all_axes(ts_result):
            # One series per requested y-axis.
            return self.serialize_multiple_axis(serializer, ts_result, y_axes,
                                                resolved_columns,
                                                allow_partial_buckets)

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_serialization"):
            # When the request is for top_events, result can be a SnubaTSResult in the event that
            # there were no top events found. In this case, result contains a zerofilled series
            # that acts as a placeholder.
            if top_events > 0 and isinstance(result, dict):
                serialized_by_key = {}
                for key, event_result in result.items():
                    if len(resolved_columns) > 1:
                        serialized_by_key[key] = serialize_all_axes(
                            event_result)
                    else:
                        # Need to get function alias if count is a field, but not the axis
                        serialized_by_key[key] = serializer.serialize(
                            event_result,
                            column=resolve_axis_column(resolved_columns[0]),
                            allow_partial_buckets=allow_partial_buckets,
                        )
                return serialized_by_key

            if len(resolved_columns) > 1:
                return serialize_all_axes(result)

            return serializer.serialize(
                result,
                resolve_axis_column(resolved_columns[0]),
                allow_partial_buckets=allow_partial_buckets,
            )