def convert_status_value(value, projects, user, environments):
    try:
        return parse_status_value(value)
    except ValueError:
        raise InvalidSearchQuery(u"invalid status value of '{}'".format(value))

def timeseries_query(selected_columns, query, params, rollup, reference_event=None, referrer=None):
    """
    High-level API for doing arbitrary user timeseries queries against events.

    This function operates on the public event schema; virtual fields and
    aggregate functions for selected columns and conditions are supported.

    This function is intended to only get timeseries based results and thus requires the
    `rollup` parameter.

    Returns a SnubaTSResult object that has been zerofilled in case of gaps.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    rollup (int) The bucket width in seconds
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    snuba_filter = get_filter(query, params)
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
    }
    if not snuba_args["start"] and not snuba_args["end"]:
        raise InvalidSearchQuery("Cannot get timeseries result without a start and end.")

    snuba_args.update(resolve_field_list(selected_columns, snuba_args, auto_fields=False))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, _ = resolve_discover_aliases(snuba_args)
    if not snuba_args["aggregations"]:
        raise InvalidSearchQuery("Cannot get timeseries result with no aggregation.")

    # Change the alias of the first aggregation to count. This ensures compatibility
    # with other parts of the timeseries endpoint expectations
    if len(snuba_args["aggregations"]) == 1:
        snuba_args["aggregations"][0][2] = "count"

    result = raw_query(
        aggregations=snuba_args.get("aggregations"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        rollup=rollup,
        orderby="time",
        groupby=["time"],
        dataset=Dataset.Discover,
        limit=10000,
        referrer=referrer,
    )
    result = zerofill(result["data"], snuba_args["start"], snuba_args["end"], rollup, "time")

    return SnubaTSResult({"data": result}, snuba_filter.start, snuba_filter.end, rollup)

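# A minimal usage sketch, not part of the original module: the column list, query
# string, rollup, and referrer below are illustrative assumptions based only on the
# docstring above, not values taken from the source.
def _example_timeseries_usage(params):
    # params is expected to carry start, end, and project_id as described in the docstring.
    return timeseries_query(
        selected_columns=["count()"],
        query="event.type:transaction",
        params=params,
        rollup=3600,
        referrer="example.timeseries",
    )
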
def visit_boolean_operator(self, node, children):
    raise InvalidSearchQuery(
        'Boolean statements containing "OR" or "AND" are not supported in this search'
    )

def find_histogram_buckets(field, params, conditions):
    match = is_function(field)
    if not match:
        raise InvalidSearchQuery(u"received {}, expected histogram function".format(field))

    columns = [c.strip() for c in match.group("columns").split(",") if len(c.strip()) > 0]

    if len(columns) != 2:
        raise InvalidSearchQuery(
            u"histogram(...) expects 2 column arguments, received {:g} arguments".format(
                len(columns)
            )
        )

    column = columns[0]
    # TODO evanh: This can be expanded to more fields at a later date, for now keep this limited.
    if column != "transaction.duration":
        raise InvalidSearchQuery(
            "histogram(...) can only be used with the transaction.duration column"
        )

    try:
        num_buckets = int(columns[1])
        if num_buckets < 1 or num_buckets > 500:
            raise Exception()
    except Exception:
        raise InvalidSearchQuery(
            u"histogram(...) requires a bucket value between 1 and 500, not {}".format(columns[1])
        )

    max_alias = u"max_{}".format(column)
    min_alias = u"min_{}".format(column)

    conditions = deepcopy(conditions) if conditions else []
    found = False
    for cond in conditions:
        if len(cond) == 3 and (cond[0], cond[1], cond[2]) == ("event.type", "=", "transaction"):
            found = True
            break
    if not found:
        conditions.append(["event.type", "=", "transaction"])

    snuba_filter = eventstore.Filter(conditions=conditions)
    translated_args, _ = resolve_discover_aliases(snuba_filter)

    results = raw_query(
        filter_keys={"project_id": params.get("project_id")},
        start=params.get("start"),
        end=params.get("end"),
        dataset=Dataset.Discover,
        conditions=translated_args.conditions,
        aggregations=[["max", "duration", max_alias], ["min", "duration", min_alias]],
    )
    if len(results["data"]) != 1:
        # If there are no transactions, and therefore no max duration, return one empty bucket.
        return "histogram({}, 1, 1, 0)".format(column)

    bucket_min = results["data"][0][min_alias]
    bucket_max = results["data"][0][max_alias]
    if bucket_max == 0:
        raise InvalidSearchQuery(u"Cannot calculate histogram for {}".format(field))
    bucket_size = ceil((bucket_max - bucket_min) / float(num_buckets))
    if bucket_size == 0.0:
        bucket_size = 1.0

    # Determine the first bucket that will show up in our results so that we can
    # zerofill correctly.
    offset = int(floor(bucket_min / bucket_size) * bucket_size)

    return "histogram({}, {:g}, {:.0f}, {:.0f})".format(column, num_buckets, bucket_size, offset)

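# A hedged sketch, not in the source: illustrates the shape of the column expression
# the function above returns. The input field string and the example output are
# assumptions; the actual bucket size and offset depend on the queried durations.
def _example_histogram_bucket_expression(params, conditions):
    expression = find_histogram_buckets("histogram(transaction.duration, 10)", params, conditions)
    # e.g. "histogram(transaction.duration, 10, 1500, 0)" when durations span 0-15000ms
    return expression
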
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions for selected columns and conditions are supported.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    """
    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim once all of
    # discover is using this module. Remember to update all the functions
    # in this module.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "orderby": orderby,
    }

    if not selected_columns:
        raise InvalidSearchQuery("No fields provided")

    snuba_args.update(resolve_field_list(selected_columns, snuba_args, auto_fields=auto_fields))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(snuba_args)

    result = raw_query(
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        groupby=snuba_args.get("groupby"),
        conditions=snuba_args.get("conditions"),
        aggregations=snuba_args.get("aggregations"),
        selected_columns=snuba_args.get("selected_columns"),
        filter_keys=snuba_args.get("filter_keys"),
        orderby=snuba_args.get("orderby"),
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_args)

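# A minimal usage sketch, not taken from the source: the columns and query string are
# illustrative assumptions showing how this query() variant is called per its docstring.
def _example_discover_query(params):
    return query(
        selected_columns=["title", "count()"],
        query="event.type:error",
        params=params,
        limit=10,
        referrer="example.discover",
    )
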
def query(
    self,
    projects,
    retention_window_start,
    group_queryset,
    environments,
    sort_by,
    limit,
    cursor,
    count_hits,
    paginator_options,
    search_filters,
    date_from,
    date_to,
    max_hits=None,
):
    now = timezone.now()
    end = None
    end_params = [_f for _f in [date_to, get_search_filter(search_filters, "date", "<")] if _f]
    if end_params:
        end = min(end_params)

    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        metrics.incr("snuba.search.postgres_only")

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if (
            cursor is None
            and sort_by == "date"
            and
            # This handles tags and date parameters for search filters.
            not [
                sf
                for sf in search_filters
                if sf.key.name not in self.postgres_only_fields.union(["date"])
            ]
        ):
            group_queryset = group_queryset.order_by("-last_seen")
            paginator = DateTimePaginator(group_queryset, "-last_seen", **paginator_options)
            # When it's a simple django-only search, we count_hits like normal
            return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        [_f for _f in [retention_window_start, now - timedelta(days=90)] if _f]
    )
    start_params = [date_from, retention_date, get_search_filter(search_filters, "date", ">")]
    start = max([_f for _f in start_params if _f])
    end = max([retention_date, end])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return self.empty_result

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return self.empty_result

    # This search is specific to Inbox. If we're using inbox sort and only querying
    # postgres then we can use this sort method. Otherwise if we need to go to Snuba,
    # fail.
    if (
        sort_by == "inbox"
        and get_search_filter(search_filters, "for_review", "=")
        # This handles tags and date parameters for search filters.
        and not [
            sf
            for sf in search_filters
            if sf.key.name not in self.postgres_only_fields.union(["date"])
        ]
    ):
        # We just filter on `GroupInbox.date_added` here, and don't filter by date
        # on the group. This keeps the query simpler and faster in some edge cases,
        # and date_added is a good enough proxy when we're using this sort.
        group_queryset = group_queryset.filter(
            groupinbox__date_added__gte=start,
            groupinbox__date_added__lte=end,
        )
        group_queryset = group_queryset.extra(
            select={"inbox_date": "sentry_groupinbox.date_added"},
        ).order_by("-inbox_date")
        paginator = DateTimePaginator(group_queryset, "-inbox_date", **paginator_options)
        return paginator.get_result(limit, cursor, count_hits=count_hits, max_hits=max_hits)

    if sort_by == "inbox":
        raise InvalidSearchQuery(f"Sort key '{sort_by}' only supported for inbox search")

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause.
    max_candidates = options.get("snuba.search.max-pre-snuba-candidates")

    with sentry_sdk.start_span(op="snuba_group_query") as span:
        group_ids = list(group_queryset.values_list("id", flat=True)[: max_candidates + 1])
        span.set_data("Max Candidates", max_candidates)
        span.set_data("Result Size", len(group_ids))
    metrics.timing("snuba.search.num_candidates", len(group_ids))

    too_many_candidates = False
    if not group_ids:
        # no matches could possibly be found from this point on
        metrics.incr("snuba.search.no_candidates", skip_internal=False)
        return self.empty_result
    elif len(group_ids) > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr("snuba.search.too_many_candidates", skip_internal=False)
        too_many_candidates = True
        group_ids = []

    sort_field = self.sort_strategies[sort_by]
    chunk_growth = options.get("snuba.search.chunk-growth-rate")
    max_chunk_size = options.get("snuba.search.max-chunk-size")
    chunk_limit = limit
    offset = 0
    num_chunks = 0
    hits = self.calculate_hits(
        group_ids,
        too_many_candidates,
        sort_field,
        projects,
        retention_window_start,
        group_queryset,
        environments,
        sort_by,
        limit,
        cursor,
        count_hits,
        paginator_options,
        search_filters,
        start,
        end,
    )
    if count_hits and hits == 0:
        return self.empty_result

    paginator_results = self.empty_result
    result_groups = []
    result_group_ids = set()

    max_time = options.get("snuba.search.max-total-chunk-time-seconds")
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have group_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(group_ids))

        # {group_id: group_score, ...}
        snuba_groups, total = self.snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            sort_field=sort_field,
            cursor=cursor,
            group_ids=group_ids,
            limit=chunk_limit,
            offset=offset,
            search_filters=search_filters,
        )
        metrics.timing("snuba.search.num_snuba_results", len(snuba_groups))
        count = len(snuba_groups)
        more_results = count >= limit and (offset + limit) < total
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if group_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the group_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits and hits is None:
                hits = len(snuba_groups)
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list("id", flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, id) for (id, score) in result_groups], reverse=True, **paginator_options
        ).get_result(limit, cursor, known_hits=hits, max_hits=max_hits)

        if group_ids or len(paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing("snuba.search.num_chunks", num_chunks)

    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results

def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions for selected columns and conditions are supported.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
                    in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregates conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")
    else:
        # We clobber this value throughout this code, so copy the value
        selected_columns = selected_columns[:]

    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number in to the columns before passing it through
    # to event search.
    idx = 0
    function_translations = {}
    for col in selected_columns:
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                op="discover.discover", description="query.histogram_calculation"
            ) as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                snuba_name = get_function_alias(histogram_column)
                sentry_name = get_function_alias(col)
                function_translations[snuba_name] = sentry_name
                # Since we're completely renaming the histogram function, we need to also check if we are
                # ordering by the histogram values, and change that.
                if orderby is not None:
                    orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
                    for i, ordering in enumerate(orderby):
                        if sentry_name == ordering.lstrip("-"):
                            ordering = "{}{}".format(
                                "-" if ordering.startswith("-") else "", snuba_name
                            )
                            orderby[i] = ordering
                            break

        idx += 1

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations
        )

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
            error_extra = u", and could not be automatically added" if auto_aggregations else u""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        u"Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        u"Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover", description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(
        op="discover.discover", description="query.transform_results"
    ) as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(
            result,
            resolved_fields["functions"],
            translated_columns,
            snuba_filter,
            selected_columns,
        )

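# A hedged usage sketch, not from the source: shows an aggregate condition in the
# query string combined with use_aggregate_conditions. The column names and the
# "count():>100" filter syntax are assumptions used for illustration only.
def _example_aggregate_condition_query(params):
    return query(
        selected_columns=["transaction", "count()"],
        query="event.type:transaction count():>100",
        params=params,
        use_aggregate_conditions=True,
        referrer="example.aggregate_conditions",
    )
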
def get_event_stats_data(
    self,
    request,
    organization,
    get_event_stats,
    top_events=0,
    query_column="count()",
    params=None,
    query=None,
):
    with self.handle_query_errors():
        with sentry_sdk.start_span(
            op="discover.endpoint", description="base.stats_query_creation"
        ):
            columns = request.GET.getlist("yAxis", [query_column])
            if query is None:
                query = request.GET.get("query")
            if params is None:
                try:
                    # events-stats is still used by events v1 which doesn't require global views
                    params = self.get_snuba_params(
                        request, organization, check_global_views=False
                    )
                except NoProjects:
                    return {"data": []}

            rollup = get_rollup_from_request(
                request,
                params,
                "1h",
                InvalidSearchQuery(
                    "Your interval and date range would create too many results. "
                    "Use a larger interval, or a smaller date range."
                ),
                top_events=top_events,
            )
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            # We need these rollup columns to generate correct events-stats results
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "epm()": "epm(%d)" % rollup,
                "eps()": "eps(%d)" % rollup,
                "tpm()": "tpm(%d)" % rollup,
                "tps()": "tps(%d)" % rollup,
            }
            query_columns = [column_map.get(column, column) for column in columns]

        with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_query"):
            result = get_event_stats(query_columns, query, params, rollup)

    serializer = SnubaTSResultSerializer(organization, None, request.user)

    with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_serialization"):
        # When the request is for top_events, result can be a SnubaTSResult in the event that
        # there were no top events found. In this case, result contains a zerofilled series
        # that acts as a placeholder.
        if top_events > 0 and isinstance(result, dict):
            results = {}
            for key, event_result in result.items():
                if len(query_columns) > 1:
                    results[key] = self.serialize_multiple_axis(
                        serializer, event_result, columns, query_columns
                    )
                else:
                    # Need to get function alias if count is a field, but not the axis
                    results[key] = serializer.serialize(
                        event_result, column=get_function_alias(query_columns[0])
                    )
            return results
        elif len(query_columns) > 1:
            return self.serialize_multiple_axis(serializer, result, columns, query_columns)
        else:
            return serializer.serialize(result)

def get_event_stats_data(self, request, organization, get_event_stats, top_events=False):
    try:
        with sentry_sdk.start_span(
            op="discover.endpoint", description="base.stats_query_creation"
        ):
            columns = request.GET.getlist("yAxis", ["count()"])
            query = request.GET.get("query")
            try:
                params = self.get_filter_params(request, organization)
            except NoProjects:
                return {"data": []}
            params = self.quantize_date_params(request, params)
            rollup = get_rollup_from_request(
                request,
                params,
                "1h",
                InvalidSearchQuery(
                    "Your interval and date range would create too many results. "
                    "Use a larger interval, or a smaller date range."
                ),
            )
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "epm()": "epm(%d)" % rollup,
                "eps()": "eps(%d)" % rollup,
            }
            query_columns = [column_map.get(column, column) for column in columns]
            reference_event = self.reference_event(
                request, organization, params.get("start"), params.get("end")
            )

        with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_query"):
            result = get_event_stats(query_columns, query, params, rollup, reference_event)
    except (discover.InvalidSearchQuery, snuba.QueryOutsideRetentionError) as error:
        raise ParseError(detail=six.text_type(error))

    serializer = SnubaTSResultSerializer(organization, None, request.user)

    with sentry_sdk.start_span(op="discover.endpoint", description="base.stats_serialization"):
        if top_events:
            results = {}
            for key, event_result in six.iteritems(result):
                if len(query_columns) > 1:
                    results[key] = self.serialize_multiple_axis(
                        serializer, event_result, columns, query_columns
                    )
                else:
                    # Need to get function alias if count is a field, but not the axis
                    results[key] = serializer.serialize(
                        event_result, get_function_alias(query_columns[0])
                    )
            return results
        elif len(query_columns) > 1:
            return self.serialize_multiple_axis(serializer, result, columns, query_columns)
        else:
            return serializer.serialize(result)

def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions for selected columns and conditions are supported.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)

        # TODO(evanh): These can be removed once we migrate the frontend / saved queries
        # to use the new function values
        selected_columns, function_translations = transform_deprecated_functions_in_columns(
            selected_columns
        )
        query = transform_deprecated_functions_in_query(query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number in to the columns before passing it through
    # to event search.
    idx = 0
    for col in selected_columns:
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                op="discover.discover", description="query.histogram_calculation"
            ) as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                function_translations[get_function_alias(histogram_column)] = get_function_alias(col)
            break

        idx += 1

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
        if orderby:
            orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
            new_orderby = []
            for ordering in orderby:
                is_reversed = ordering.startswith("-")
                ordering = ordering.lstrip("-")
                for snuba_name, sentry_name in six.iteritems(function_translations):
                    if sentry_name == ordering:
                        ordering = snuba_name
                        break

                ordering = "{}{}".format("-" if is_reversed else "", ordering)
                new_orderby.append(ordering)

            snuba_filter.orderby = new_orderby

        snuba_filter.update_with(
            resolve_field_list(selected_columns, snuba_filter, auto_fields=auto_fields)
        )

        if reference_event:
            ref_conditions = create_reference_event_conditions(reference_event)
            if ref_conditions:
                snuba_filter.conditions.extend(ref_conditions)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations
        )

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        else:
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        u"Aggregate(s) {} used in a condition but are not in the selected columns.".format(
                            ", ".join(conditions_not_in_aggregations)
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        u"Aggregate {} used in a condition but is not a selected column.".format(
                            having_clause[0]
                        )
                    )

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover", description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(
        op="discover.discover", description="query.transform_results"
    ) as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(result, translated_columns, snuba_filter, selected_columns)

def histogram_query(
    fields,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
):
    """
    API for generating histograms for numeric columns.

    A multihistogram is possible only if the columns are all measurements.
    The resulting histograms will have their bins aligned.

    :param [str] fields: The list of fields you want to generate histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    """
    multiplier = int(10 ** precision)
    if max_value is not None:
        # We want the specified max_value to be exclusive, and the queried max_value
        # to be inclusive. So we adjust the specified max_value using the multiplier.
        max_value -= 0.1 / multiplier
    min_value, max_value = find_histogram_min_max(
        fields, min_value, max_value, user_query, params, data_filter
    )

    key_column = None
    conditions = []
    if len(fields) > 1:
        key_column = "array_join(measurements_key)"
        key_alias = get_function_alias(key_column)
        measurements = []
        for f in fields:
            measurement = get_measurement_name(f)
            if measurement is None:
                raise InvalidSearchQuery(
                    "multihistogram expected all measurements, received: {}".format(f)
                )
            measurements.append(measurement)
        conditions.append([key_alias, "IN", measurements])

    histogram_params = find_histogram_params(num_buckets, min_value, max_value, multiplier)
    histogram_column = get_histogram_column(fields, key_column, histogram_params)
    histogram_alias = get_function_alias(histogram_column)

    if min_value is None or max_value is None:
        return normalize_histogram_results(fields, key_column, histogram_params, {"data": []})
    # make sure to bound the bins to get the desired range of results
    if min_value is not None:
        min_bin = histogram_params.start_offset
        conditions.append([histogram_alias, ">=", min_bin])
    if max_value is not None:
        max_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
        conditions.append([histogram_alias, "<=", max_bin])

    columns = [] if key_column is None else [key_column]
    results = query(
        selected_columns=columns + [histogram_column, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        orderby=[histogram_alias],
        limit=len(fields) * num_buckets,
        referrer=referrer,
        functions_acl=["array_join", "histogram"],
    )

    return normalize_histogram_results(fields, key_column, histogram_params, results)

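# A minimal usage sketch, not part of the source: the measurement field names and
# bucket count are assumptions illustrating how histogram_query builds aligned
# histograms for multiple measurement columns, per its docstring.
def _example_measurement_histograms(params):
    return histogram_query(
        fields=["measurements.fcp", "measurements.lcp"],
        user_query="event.type:transaction",
        params=params,
        num_buckets=50,
        referrer="example.histogram",
    )
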
def query( self, projects, environments=None, sort_by="date", limit=100, cursor=None, count_hits=False, paginator_options=None, search_filters=None, date_from=None, date_to=None, ): search_filters = search_filters if search_filters is not None else [] # ensure projects are from same org if len({p.organization_id for p in projects}) != 1: raise RuntimeError("Cross organization search not supported") if paginator_options is None: paginator_options = {} # filter out groups which are beyond the retention period retention = quotas.get_event_retention(organization=projects[0].organization) if retention: retention_window_start = timezone.now() - timedelta(days=retention) else: retention_window_start = None group_queryset = self._build_group_queryset( projects=projects, environments=environments, search_filters=search_filters, retention_window_start=retention_window_start, date_from=date_from, date_to=date_to, ) query_executor = self._get_query_executor( group_queryset=group_queryset, projects=projects, environments=environments, search_filters=search_filters, date_from=date_from, date_to=date_to, ) # ensure sort strategy is supported by executor if not query_executor.has_sort_strategy(sort_by): raise InvalidSearchQuery(u"Sort key '{}' not supported.".format(sort_by)) return query_executor.query( projects=projects, retention_window_start=retention_window_start, group_queryset=group_queryset, environments=environments, sort_by=sort_by, limit=limit, cursor=cursor, count_hits=count_hits, paginator_options=paginator_options, search_filters=search_filters, date_from=date_from, date_to=date_to, )
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions for selected columns and conditions are supported.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # TODO(evanh): These can be removed once we migrate the frontend / saved queries
    # to use the new function values
    selected_columns, function_translations = transform_deprecated_functions_in_columns(
        selected_columns
    )
    query = transform_deprecated_functions_in_query(query)

    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim once all of
    # discover is using this module. Remember to update all the functions
    # in this module.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "orderby": orderby,
        "having": [],
    }

    if use_aggregate_conditions:
        snuba_args["having"] = snuba_filter.having

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number in to the columns before passing it through
    # to event search.
    idx = 0
    for col in selected_columns:
        if col.startswith("histogram("):
            histogram_column = find_histogram_buckets(col, params, snuba_filter.conditions)
            selected_columns[idx] = histogram_column
            function_translations[get_function_alias(histogram_column)] = get_function_alias(col)
            break

        idx += 1

    # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
    if orderby:
        orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
        new_orderby = []
        for ordering in orderby:
            is_reversed = ordering.startswith("-")
            ordering = ordering.lstrip("-")
            for snuba_name, sentry_name in six.iteritems(function_translations):
                if sentry_name == ordering:
                    ordering = snuba_name
                    break

            ordering = "{}{}".format("-" if is_reversed else "", ordering)
            new_orderby.append(ordering)

        snuba_args["orderby"] = new_orderby

    snuba_args.update(
        resolve_field_list(selected_columns, snuba_args, params=params, auto_fields=auto_fields)
    )

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(snuba_args, function_translations)

    # Make sure that any aggregate conditions are also in the selected columns
    for having_clause in snuba_args.get("having"):
        found = any(
            having_clause[0] == agg_clause[-1] for agg_clause in snuba_args.get("aggregations")
        )
        if not found:
            raise InvalidSearchQuery(
                u"Aggregate {} used in a condition but is not a selected column.".format(
                    having_clause[0]
                )
            )

    if conditions is not None:
        snuba_args["conditions"].extend(conditions)

    result = raw_query(
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        groupby=snuba_args.get("groupby"),
        conditions=snuba_args.get("conditions"),
        aggregations=snuba_args.get("aggregations"),
        selected_columns=snuba_args.get("selected_columns"),
        filter_keys=snuba_args.get("filter_keys"),
        having=snuba_args.get("having"),
        orderby=snuba_args.get("orderby"),
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_args)

def find_histogram_buckets(field, params, conditions):
    match = is_function(field)
    if not match:
        raise InvalidSearchQuery(u"received {}, expected histogram function".format(field))

    columns = [c.strip() for c in match.group("columns").split(",") if len(c.strip()) > 0]

    if len(columns) != 2:
        raise InvalidSearchQuery(
            u"histogram(...) expects 2 column arguments, received {:g} arguments".format(
                len(columns)
            )
        )

    column = columns[0]
    # TODO evanh: This can be expanded to more fields at a later date, for now keep this limited.
    if column != "transaction.duration":
        raise InvalidSearchQuery(
            "histogram(...) can only be used with the transaction.duration column"
        )

    try:
        num_buckets = int(columns[1])
        if num_buckets < 1 or num_buckets > 500:
            raise Exception()
    except Exception:
        raise InvalidSearchQuery(
            u"histogram(...) requires a bucket value between 1 and 500, not {}".format(columns[1])
        )

    alias = u"max_{}".format(column)

    conditions = deepcopy(conditions) if conditions else []
    found = False
    for cond in conditions:
        if (cond[0], cond[1], cond[2]) == ("event.type", "=", "transaction"):
            found = True
    if not found:
        conditions.append(["event.type", "=", "transaction"])

    translated_args, _ = resolve_discover_aliases({"conditions": conditions})

    results = raw_query(
        filter_keys={"project_id": params.get("project_id")},
        start=params.get("start"),
        end=params.get("end"),
        dataset=Dataset.Discover,
        conditions=translated_args["conditions"],
        aggregations=[["max", "duration", alias]],
    )
    if len(results["data"]) != 1:
        # If there are no transactions, and therefore no max duration, return one empty bucket.
        return "histogram({}, 1, 1)".format(column)

    bucket_max = results["data"][0][alias]
    if bucket_max == 0:
        raise InvalidSearchQuery(u"Cannot calculate histogram for {}".format(field))
    bucket_number = ceil(bucket_max / float(num_buckets))

    return "histogram({}, {:g}, {:g})".format(column, num_buckets, bucket_number)