def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
        in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregate conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
        any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")
    else:
        # We clobber this value throughout this code, so copy the value
        selected_columns = selected_columns[:]

    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    function_translations = {}

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )
        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations
        )

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
            error_extra = u", and could not be automatically added" if auto_aggregations else u""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)
                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        u"Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        u"Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

    if conditions is not None:
        snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover", description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(
        op="discover.discover", description="query.transform_results"
    ) as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(
            result,
            resolved_fields["functions"],
            translated_columns,
            snuba_filter,
            selected_columns,
        )
def timeseries_query(selected_columns, query, params, rollup, reference_event=None, referrer=None): """ High-level API for doing arbitrary user timeseries queries against events. This function operates on the public event schema and virtual fields/aggregate functions for selected columns and conditions are supported through this function. This function is intended to only get timeseries based results and thus requires the `rollup` parameter. Returns a SnubaTSResult object that has been zerofilled in case of gaps. selected_columns (Sequence[str]) List of public aliases to fetch. query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment, rollup (int) The bucket width in seconds reference_event (ReferenceEvent) A reference event object. Used to generate additional conditions based on the provided reference. referrer (str|None) A referrer string to help locate the origin of this query. """ snuba_filter = get_filter(query, params) snuba_args = { "start": snuba_filter.start, "end": snuba_filter.end, "conditions": snuba_filter.conditions, "filter_keys": snuba_filter.filter_keys, "having": snuba_filter.having, } if not snuba_args["start"] and not snuba_args["end"]: raise InvalidSearchQuery( "Cannot get timeseries result without a start and end.") snuba_args.update( resolve_field_list(selected_columns, snuba_args, auto_fields=False)) if reference_event: ref_conditions = create_reference_event_conditions(reference_event) if ref_conditions: snuba_args["conditions"].extend(ref_conditions) # Resolve the public aliases into the discover dataset names. snuba_args, _ = resolve_discover_aliases(snuba_args) if not snuba_args["aggregations"]: raise InvalidSearchQuery( "Cannot get timeseries result with no aggregation.") # Change the alias of the first aggregation to count. This ensures compatibility # with other parts of the timeseries endpoint expectations if len(snuba_args["aggregations"]) == 1: snuba_args["aggregations"][0][2] = "count" result = raw_query( aggregations=snuba_args.get("aggregations"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), start=snuba_args.get("start"), end=snuba_args.get("end"), rollup=rollup, orderby="time", groupby=["time"], dataset=Dataset.Discover, limit=10000, referrer=referrer, ) result = zerofill(result["data"], snuba_args["start"], snuba_args["end"], rollup, "time") return SnubaTSResult({"data": result}, snuba_filter.start, snuba_filter.end, rollup)
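# Usage sketch for timeseries_query() (hypothetical values; assumes a
# configured snuba backend):
#
#     result = timeseries_query(
#         selected_columns=["count()"],
#         query="event.type:error",
#         params={
#             "start": datetime(2020, 5, 1, tzinfo=pytz.utc),
#             "end": datetime(2020, 5, 2, tzinfo=pytz.utc),
#             "project_id": [1],
#         },
#         rollup=3600,  # one-hour buckets
#         referrer="api.example",
#     )
#
# The returned SnubaTSResult contains one row per rollup bucket, with empty
# buckets zerofilled, e.g. [{"time": 1588291200, "count": 17}, ...].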
def get_facets(query, params, limit=10, referrer=None): """ High-level API for getting 'facet map' results. Facets are high frequency tags and attribute results that can be used to further refine user queries. When many projects are requested sampling will be enabled to help keep response times low. query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment limit (int) The number of records to fetch. referrer (str|None) A referrer string to help locate the origin of this query. Returns Sequence[FacetResult] """ snuba_filter = get_filter(query, params) # TODO(mark) Refactor the need for this translation shim. snuba_args = { "start": snuba_filter.start, "end": snuba_filter.end, "conditions": snuba_filter.conditions, "filter_keys": snuba_filter.filter_keys, } # Resolve the public aliases into the discover dataset names. snuba_args, translated_columns = resolve_discover_aliases(snuba_args) # Exclude tracing tags as they are noisy and generally not helpful. excluded_tags = [ "tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span", "project"] ] # Sampling keys for multi-project results as we don't need accuracy # with that much data. sample = len(snuba_filter.filter_keys["project_id"]) > 2 # Get the most frequent tag keys key_names = raw_query( aggregations=[["count", None, "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), orderby=["-count", "tags_key"], groupby="tags_key", having=[excluded_tags], dataset=Dataset.Discover, limit=limit, referrer=referrer, turbo=sample, ) top_tags = [r["tags_key"] for r in key_names["data"]] if not top_tags: return [] # TODO(mark) Make the sampling rate scale based on the result size and scaling factor in # sentry.options. To test the lowest acceptable sampling rate, we use 0.1 which # is equivalent to turbo. We don't use turbo though as we need to re-scale data, and # using turbo could cause results to be wrong if the value of turbo is changed in snuba. sample_rate = 0.1 if key_names["data"][0]["count"] > 10000 else None # Rescale the results if we're sampling multiplier = 1 / sample_rate if sample_rate is not None else 1 fetch_projects = False if len(params.get("project_id", [])) > 1: if len(top_tags) == limit: top_tags.pop() fetch_projects = True results = [] if fetch_projects: project_values = raw_query( aggregations=[["count", None, "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), groupby="project_id", orderby="-count", dataset=Dataset.Discover, referrer=referrer, sample=sample_rate, ) results.extend([ FacetResult("project", r["project_id"], int(r["count"]) * multiplier) for r in project_values["data"] ]) # Get tag counts for our top tags. Fetching them individually # allows snuba to leverage promoted tags better and enables us to get # the value count we want. 
    max_aggregate_tags = options.get("discover2.max_tags_to_combine")
    individual_tags = []
    aggregate_tags = []
    for i, tag in enumerate(top_tags):
        if tag == "environment":
            # Tags that should always be queried individually go here.
            individual_tags.append(tag)
        elif i >= len(top_tags) - max_aggregate_tags:
            aggregate_tags.append(tag)
        else:
            individual_tags.append(tag)

    for tag_name in individual_tags:
        tag = u"tags[{}]".format(tag_name)
        tag_values = raw_query(
            aggregations=[["count", None, "count"]],
            conditions=snuba_args.get("conditions"),
            start=snuba_args.get("start"),
            end=snuba_args.get("end"),
            filter_keys=snuba_args.get("filter_keys"),
            orderby=["-count"],
            groupby=[tag],
            limit=TOP_VALUES_DEFAULT_LIMIT,
            dataset=Dataset.Discover,
            referrer=referrer,
            sample=sample_rate,
        )
        results.extend(
            [
                FacetResult(tag_name, r[tag], int(r["count"]) * multiplier)
                for r in tag_values["data"]
            ]
        )

    if aggregate_tags:
        conditions = snuba_args.get("conditions", [])
        conditions.append(["tags_key", "IN", aggregate_tags])
        tag_values = raw_query(
            aggregations=[["count", None, "count"]],
            conditions=conditions,
            start=snuba_args.get("start"),
            end=snuba_args.get("end"),
            filter_keys=snuba_args.get("filter_keys"),
            orderby=["tags_key", "-count"],
            groupby=["tags_key", "tags_value"],
            dataset=Dataset.Discover,
            referrer=referrer,
            sample=sample_rate,
            limitby=[TOP_VALUES_DEFAULT_LIMIT, "tags_key"],
        )
        results.extend(
            [
                # This query is also sampled, so rescale its counts as well.
                FacetResult(r["tags_key"], r["tags_value"], int(r["count"]) * multiplier)
                for r in tag_values["data"]
            ]
        )

    return results
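# The sampling arithmetic above in numbers (illustrative values): with
# sample_rate = 0.1 the multiplier is 1 / 0.1 == 10, so a sampled count of
# 120 rows is reported as an estimated true count of 1200.
#
#     sample_rate = 0.1
#     multiplier = 1 / sample_rate       # 10.0
#     estimate = int(120 * multiplier)   # 1200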
def get_performance_facets(
    query,
    params,
    orderby=None,
    aggregate_column="duration",
    aggregate_function="avg",
    limit=20,
    referrer=None,
):
    """
    High-level API for getting 'facet map' results for performance data

    Performance facets are high frequency tags and the aggregate duration of
    their most frequent values

    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.

    Returns Sequence[FacetResult]
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

    with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"):
        # Get the overall aggregate and count across all matching transactions
        key_names = raw_query(
            aggregations=[
                [aggregate_function, aggregate_column, "aggregate"],
                ["count", None, "count"],
            ],
            start=snuba_filter.start,
            end=snuba_filter.end,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            orderby=["-count"],
            dataset=Dataset.Discover,
            limit=limit,
            referrer="{}.{}".format(referrer, "all_transactions"),
        )
        counts = [r["count"] for r in key_names["data"]]
        aggregates = [r["aggregate"] for r in key_names["data"]]

        # Return early to avoid doing more queries when there are no matching
        # transactions, or when the aggregate column doesn't exist.
        if len(counts) != 1 or counts[0] == 0 or aggregates[0] is None:
            return []

    results = []
    snuba_filter.conditions.append([aggregate_column, "IS NOT NULL", None])

    # Aggregate for transaction
    transaction_aggregate = key_names["data"][0]["aggregate"]

    # Dynamically sample so that at least ~50,000 transactions are still selected
    transaction_count = key_names["data"][0]["count"]
    sampling_enabled = transaction_count > 50000
    # Log growth starting at 50,000
    target_sample = 50000 * (math.log(transaction_count, 10) - 3)

    dynamic_sample_rate = 0 if transaction_count <= 0 else (target_sample / transaction_count)
    sample_rate = min(max(dynamic_sample_rate, 0), 1) if sampling_enabled else None
    frequency_sample_rate = sample_rate if sample_rate else 1

    excluded_tags = [
        "tags_key",
        "NOT IN",
        ["trace", "trace.ctx", "trace.span", "project", "browser", "celery_task_id"],
    ]

    with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"):
        conditions = snuba_filter.conditions
        aggregate_comparison = transaction_aggregate * 1.01 if transaction_aggregate else 0
        having = [excluded_tags]
        if orderby and orderby in ("sumdelta", "-sumdelta", "aggregate", "-aggregate"):
            having.append(["aggregate", ">", aggregate_comparison])

        if orderby is None:
            orderby = []
        else:
            orderby = [orderby]

        tag_values = raw_query(
            selected_columns=[
                [
                    "sum",
                    [
                        "minus",
                        [
                            aggregate_column,
                            str(transaction_aggregate),
                        ],
                    ],
                    "sumdelta",
                ],
            ],
            aggregations=[
                [aggregate_function, aggregate_column, "aggregate"],
                ["count", None, "cnt"],
            ],
            conditions=conditions,
            start=snuba_filter.start,
            end=snuba_filter.end,
            filter_keys=snuba_filter.filter_keys,
            orderby=orderby + ["tags_key"],
            groupby=["tags_key", "tags_value"],
            having=having,
            dataset=Dataset.Discover,
            referrer="{}.{}".format(referrer, "tag_values"),
            sample=sample_rate,
            turbo=sample_rate is not None,
            limitby=[1, "tags_key"],
        )
        results.extend(
            [
                PerformanceFacetResult(
                    key=r["tags_key"],
                    value=r["tags_value"],
                    performance=float(r["aggregate"]),
                    frequency=float((r["cnt"] / frequency_sample_rate) / transaction_count),
                    comparison=float(r["aggregate"] / transaction_aggregate),
                    sumdelta=float(r["sumdelta"]),
                )
                for r in tag_values["data"]
            ]
        )

    return results
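# The dynamic sampling math above, worked through (the transaction count is a
# made-up illustration):
#
#     transaction_count = 1000000
#     target_sample = 50000 * (math.log(transaction_count, 10) - 3)  # 150000.0
#     sample_rate = min(max(target_sample / transaction_count, 0), 1)  # 0.15
#
# i.e. at one million transactions roughly 15% are sampled, and the sampled
# frequencies are divided by frequency_sample_rate to recover estimates.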
def query( selected_columns, query, params, orderby=None, offset=None, limit=50, reference_event=None, referrer=None, auto_fields=False, use_aggregate_conditions=False, ): """ High-level API for doing arbitrary user queries against events. This function operates on the Discover public event schema and virtual fields/aggregate functions for selected columns and conditions are supported through this function. The resulting list will have all internal field names mapped back into their public schema names. selected_columns (Sequence[str]) List of public aliases to fetch. query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment orderby (None|str|Sequence[str]) The field to order results by. offset (None|int) The record offset to read. limit (int) The number of records to fetch. reference_event (ReferenceEvent) A reference event object. Used to generate additional conditions based on the provided reference. referrer (str|None) A referrer string to help locate the origin of this query. auto_fields (bool) Set to true to have project + eventid fields automatically added. """ if not selected_columns: raise InvalidSearchQuery("No columns selected") snuba_filter = get_filter(query, params) # TODO(mark) Refactor the need for this translation shim once all of # discover is using this module. Remember to update all the functions # in this module. snuba_args = { "start": snuba_filter.start, "end": snuba_filter.end, "conditions": snuba_filter.conditions, "filter_keys": snuba_filter.filter_keys, "orderby": orderby, "having": [], } if use_aggregate_conditions: snuba_args["having"] = snuba_filter.having snuba_args.update( resolve_field_list(selected_columns, snuba_args, auto_fields=auto_fields)) if reference_event: ref_conditions = create_reference_event_conditions(reference_event) if ref_conditions: snuba_args["conditions"].extend(ref_conditions) # Resolve the public aliases into the discover dataset names. snuba_args, translated_columns = resolve_discover_aliases(snuba_args) # Make sure that any aggregate conditions are also in the selected columns for having_clause in snuba_args.get("having"): found = any(having_clause[0] == agg_clause[-1] for agg_clause in snuba_args.get("aggregations")) if not found: raise InvalidSearchQuery( u"Aggregate {} used in a condition but is not a selected column." .format(having_clause[0])) result = raw_query( start=snuba_args.get("start"), end=snuba_args.get("end"), groupby=snuba_args.get("groupby"), conditions=snuba_args.get("conditions"), aggregations=snuba_args.get("aggregations"), selected_columns=snuba_args.get("selected_columns"), filter_keys=snuba_args.get("filter_keys"), having=snuba_args.get("having"), orderby=snuba_args.get("orderby"), dataset=Dataset.Discover, limit=limit, offset=offset, referrer=referrer, ) return transform_results(result, translated_columns, snuba_args)
def get_pagination_ids(event, query, params, organization, reference_event=None, referrer=None):
    """
    High-level API for getting pagination data for an event + filter

    The provided event is used as a reference event to find events
    that are older and newer than the current one.

    event (Event) The event to find related events for.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment,
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
        conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    # TODO(evanh): This can be removed once we migrate the frontend / saved queries
    # to use the new function values
    query = transform_deprecated_functions_in_query(query)

    snuba_filter = get_filter(query, params)

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_filter.conditions.extend(ref_conditions)

    result = {
        "next": eventstore.get_next_event_id(event, filter=snuba_filter),
        "previous": eventstore.get_prev_event_id(event, filter=snuba_filter),
        "latest": eventstore.get_latest_event_id(event, filter=snuba_filter),
        "oldest": eventstore.get_earliest_event_id(event, filter=snuba_filter),
    }

    # translate project ids to slugs
    project_ids = set(item[0] for item in result.values() if item)
    project_slugs = {}
    projects = Project.objects.filter(
        id__in=list(project_ids),
        organization=organization,
        status=ProjectStatus.VISIBLE,
    ).values("id", "slug")

    for project in projects:
        project_slugs[project["id"]] = project["slug"]

    def into_pagination_record(project_slug_event_id):
        if not project_slug_event_id:
            return None
        project_id = int(project_slug_event_id[0])
        return "{}:{}".format(project_slugs[project_id], project_slug_event_id[1])

    for key, value in result.items():
        result[key] = into_pagination_record(value)

    return PaginationResult(**result)
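# The resulting PaginationResult fields are "project-slug:event-id" strings,
# or None when no neighbouring event exists. Hypothetical example:
#
#     pagination = get_pagination_ids(event, "event.type:error", params, org)
#     pagination.next      # e.g. "my-project:aabbccdd..."
#     pagination.previous  # None if this is the oldest matching event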
def prepare_discover_query(
    selected_columns,
    query,
    params,
    orderby=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )
        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)
                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

    if conditions is not None:
        snuba_filter.conditions.extend(conditions)

    return PreparedQuery(snuba_filter, translated_columns, resolved_fields)
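# Sketch of composing this helper with raw_query (assumes PreparedQuery is a
# namedtuple-style container exposing the three values returned above; the
# attribute names used here are illustrative, not confirmed):
#
#     prepared = prepare_discover_query(["transaction", "count()"], "", params)
#     result = raw_query(
#         start=prepared.filter.start,
#         end=prepared.filter.end,
#         conditions=prepared.filter.conditions,
#         aggregations=prepared.filter.aggregations,
#         filter_keys=prepared.filter.filter_keys,
#         dataset=Dataset.Discover,
#     )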
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    use_aggregate_conditions (bool) Set to true if aggregate conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
        any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")
    else:
        # We clobber this value throughout this code, so copy the value
        selected_columns = selected_columns[:]

    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number into the columns before passing it through
    # to event search.
    idx = 0
    function_translations = {}
    for col in selected_columns:
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                op="discover.discover", description="query.histogram_calculation"
            ) as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                snuba_name = get_function_alias(histogram_column)
                sentry_name = get_function_alias(col)
                function_translations[snuba_name] = sentry_name
                # Since we're completely renaming the histogram function, we need to also check
                # if we are ordering by the histogram values, and change that.
                if orderby is not None:
                    orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
                    for i, ordering in enumerate(orderby):
                        if sentry_name == ordering.lstrip("-"):
                            ordering = "{}{}".format(
                                "-" if ordering.startswith("-") else "", snuba_name
                            )
                            orderby[i] = ordering
                            break

        idx += 1

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        snuba_filter.update_with(
            resolve_field_list(selected_columns, snuba_filter, auto_fields=auto_fields)
        )

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations
        )

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions.
            # Loop through to make sure any referenced functions are in the aggregations.
if isinstance(having_clause[0], (list, tuple)): # Functions are of the form [fn, [args]] args_to_check = [[having_clause[0]]] conditions_not_in_aggregations = [] while len(args_to_check) > 0: args = args_to_check.pop() for arg in args: if arg[0] in [SNUBA_AND, SNUBA_OR]: args_to_check.extend(arg[1]) else: alias = arg[1][0] found = any( alias == agg_clause[-1] for agg_clause in snuba_filter.aggregations) if not found: conditions_not_in_aggregations.append(alias) if len(conditions_not_in_aggregations) > 0: raise InvalidSearchQuery( u"Aggregate(s) {} used in a condition but are not in the selected columns." .format(", ".join(conditions_not_in_aggregations))) else: found = any(having_clause[0] == agg_clause[-1] for agg_clause in snuba_filter.aggregations) if not found: raise InvalidSearchQuery( u"Aggregate {} used in a condition but is not a selected column." .format(having_clause[0])) if conditions is not None: snuba_filter.conditions.extend(conditions) with sentry_sdk.start_span(op="discover.discover", description="query.snuba_query"): result = raw_query( start=snuba_filter.start, end=snuba_filter.end, groupby=snuba_filter.groupby, conditions=snuba_filter.conditions, aggregations=snuba_filter.aggregations, selected_columns=snuba_filter.selected_columns, filter_keys=snuba_filter.filter_keys, having=snuba_filter.having, orderby=snuba_filter.orderby, dataset=Dataset.Discover, limit=limit, offset=offset, referrer=referrer, ) with sentry_sdk.start_span(op="discover.discover", description="query.transform_results") as span: span.set_data("result_count", len(result.get("data", []))) return transform_results(result, translated_columns, snuba_filter, selected_columns)
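# Illustration of the histogram rewrite above (alias strings are hypothetical):
# a selected column such as "histogram(transaction.duration, 10)" is replaced
# by the bucketed expression returned by find_histogram_buckets(), and
# function_translations maps the new snuba-side alias back to the public one,
# so an orderby like "-histogram_transaction_duration_10" is renamed in step
# with the column and keeps working.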
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
        conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    use_aggregate_conditions (bool) Set to true if aggregate conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
        any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # TODO(evanh): These can be removed once we migrate the frontend / saved queries
    # to use the new function values
    selected_columns, function_translations = transform_deprecated_functions_in_columns(
        selected_columns
    )
    query = transform_deprecated_functions_in_query(query)

    snuba_filter = get_filter(query, params)
    if not use_aggregate_conditions:
        snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number into the columns before passing it through
    # to event search.
    idx = 0
    for col in selected_columns:
        if col.startswith("histogram("):
            histogram_column = find_histogram_buckets(col, params, snuba_filter.conditions)
            selected_columns[idx] = histogram_column
            function_translations[get_function_alias(histogram_column)] = get_function_alias(col)
            break

        idx += 1

    # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
    if orderby:
        orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
        new_orderby = []

        for ordering in orderby:
            is_reversed = ordering.startswith("-")
            ordering = ordering.lstrip("-")

            for snuba_name, sentry_name in six.iteritems(function_translations):
                if sentry_name == ordering:
                    ordering = snuba_name
                    break

            ordering = "{}{}".format("-" if is_reversed else "", ordering)
            new_orderby.append(ordering)

        snuba_filter.orderby = new_orderby

    snuba_filter.update_with(
        resolve_field_list(selected_columns, snuba_filter, auto_fields=auto_fields)
    )

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_filter.conditions.extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_filter, translated_columns = resolve_discover_aliases(
        snuba_filter, function_translations
    )

    # Make sure that any aggregate conditions are also in the selected columns
    for having_clause in snuba_filter.having:
        found = any(
            having_clause[0] == agg_clause[-1] for agg_clause in snuba_filter.aggregations
        )
        if not found:
            raise InvalidSearchQuery(
                u"Aggregate {} used in a condition but is not a selected column.".format(
                    having_clause[0]
                )
            )

    if conditions is not None:
        snuba_filter.conditions.extend(conditions)

    result = raw_query(
        start=snuba_filter.start,
        end=snuba_filter.end,
        groupby=snuba_filter.groupby,
        conditions=snuba_filter.conditions,
        aggregations=snuba_filter.aggregations,
        selected_columns=snuba_filter.selected_columns,
        filter_keys=snuba_filter.filter_keys,
        having=snuba_filter.having,
        orderby=snuba_filter.orderby,
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_filter, selected_columns)
def calculate_incident_start(query, projects, groups):
    """
    Attempts to automatically calculate the date that an incident began,
    based on the events related to the incident.
    """
    params = {}
    if groups:
        params["group_ids"] = [g.id for g in groups]
        end = max(g.last_seen for g in groups) + timedelta(seconds=1)
    else:
        end = timezone.now()

    params["start"] = end - INCIDENT_START_PERIOD
    params["end"] = end

    if projects:
        params["project_id"] = [p.id for p in projects]

    filter = get_filter(query, params)
    rollup = int(INCIDENT_START_ROLLUP.total_seconds())

    result = raw_query(
        aggregations=[("count()", "", "count"), ("min", "timestamp", "first_seen")],
        orderby="time",
        groupby=["time"],
        rollup=rollup,
        referrer="incidents.calculate_incident_start",
        limit=10000,
        start=filter.start,
        end=filter.end,
        conditions=filter.conditions,
        filter_keys=filter.filter_keys,
    )["data"]
    # TODO: Start could be the period before the first period we find
    result = zerofill(result, params["start"], params["end"], rollup, "time")

    # We want to linearly scale scores from 100% value at the most recent to
    # 50% at the oldest. This gives a bias towards newer results.
    negative_weight = (1.0 / len(result)) / 2
    multiplier = 1.0
    cur_spike_max_count = -1
    cur_spike_start = None
    cur_spike_end = None
    max_height = 0
    incident_start = None
    cur_height = 0
    prev_count = 0

    def get_row_first_seen(row, default=None):
        first_seen = default
        if "first_seen" in row:
            first_seen = parse_date(row["first_seen"]).replace(tzinfo=pytz.utc)
        return first_seen

    def calculate_start(spike_start, spike_end):
        """
        We arbitrarily choose a date about 1/3 into the incident period. We could
        potentially improve this if we want by analyzing the period in more detail
        and choosing a date that most closely fits with being 1/3 up the spike.
        """
        spike_length = spike_end - spike_start
        return spike_start + (spike_length / 3)

    for row in reversed(result):
        cur_count = row.get("count", 0)
        if cur_count < prev_count or (cur_count > 0 and cur_count == prev_count):
            cur_height = cur_spike_max_count - cur_count
        elif cur_count > 0 or prev_count > 0 or cur_height > 0:
            # Now we've got the height of the current spike, compare it to the
            # current max. We decrease the value by `multiplier` so that we
            # favour newer results
            cur_height *= multiplier
            if cur_height > max_height:
                # If we detect that we have a new highest peak, then set a new
                # incident start date
                incident_start = calculate_start(cur_spike_start, cur_spike_end)
                max_height = cur_height

            cur_height = 0
            cur_spike_max_count = cur_count
            cur_spike_end = get_row_first_seen(row)

        # We attempt to get the first_seen value from the row here. If the row
        # doesn't have it (because it's a zerofilled row), then just use the
        # previous value. This allows us to have the start of a spike always be
        # a bucket that contains at least one element.
        cur_spike_start = get_row_first_seen(row, cur_spike_start)
        prev_count = cur_count
        multiplier -= negative_weight

    if (cur_height > max_height or not incident_start) and cur_spike_start:
        incident_start = calculate_start(cur_spike_start, cur_spike_end)

    if not incident_start:
        incident_start = timezone.now()

    return incident_start
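# Worked example of calculate_start(): for a spike spanning 09:00 to 09:09 the
# chosen incident start lands a third of the way in, at 09:03 (illustrative
# datetimes):
#
#     spike_start = datetime(2020, 5, 1, 9, 0, tzinfo=pytz.utc)
#     spike_end = datetime(2020, 5, 1, 9, 9, tzinfo=pytz.utc)
#     spike_start + (spike_end - spike_start) / 3  # 09:03:00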
def validate_alert_rule_query(query): # TODO: We should add more validation here to reject queries that include # fields that are invalid in alert rules. For now this will just make sure # the query parses correctly. get_filter(query)
def get_performance_facets( query, params, orderby=None, aggregate_column="duration", aggregate_function="avg", limit=20, referrer=None, ): """ High-level API for getting 'facet map' results for performance data Performance facets are high frequency tags and the aggregate duration of their most frequent values query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment limit (int) The number of records to fetch. referrer (str|None) A referrer string to help locate the origin of this query. Returns Sequence[FacetResult] """ with sentry_sdk.start_span(op="discover.discover", description="facets.filter_transform") as span: span.set_data("query", query) snuba_filter = get_filter(query, params) # Resolve the public aliases into the discover dataset names. snuba_filter, translated_columns = resolve_discover_aliases( snuba_filter) # Exclude tracing tags as they are noisy and generally not helpful. # TODO(markus): Tracing tags are no longer written but may still reside in DB. excluded_tags = [ "tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span", "project"] ] # Sampling keys for multi-project results as we don't need accuracy # with that much data. sample = len(snuba_filter.filter_keys["project_id"]) > 2 with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"): # Get the most relevant tag keys key_names = raw_query( aggregations=[["count", None, "count"]], start=snuba_filter.start, end=snuba_filter.end, conditions=snuba_filter.conditions, filter_keys=snuba_filter.filter_keys, orderby=["-count", "tags_key"], groupby="tags_key", # TODO(Kevan): Check using having vs where before mainlining having=[excluded_tags], dataset=Dataset.Discover, limit=limit, referrer=referrer, turbo=sample, ) top_tags = [r["tags_key"] for r in key_names["data"]] if not top_tags: return [] results = [] snuba_filter.conditions.append([aggregate_column, "IS NOT NULL", None]) # Only enable sampling if over 10000 values sampling_enabled = key_names["data"][0]["count"] > 10000 options_sample_rate = options.get( "discover2.tags_performance_facet_sample_rate") or 0.1 sample_rate = options_sample_rate if sampling_enabled else None max_aggregate_tags = 20 aggregate_tags = [] for i, tag in enumerate(top_tags): if i >= len(top_tags) - max_aggregate_tags: aggregate_tags.append(tag) if orderby is None: orderby = [] if aggregate_tags: with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"): conditions = snuba_filter.conditions conditions.append(["tags_key", "IN", aggregate_tags]) tag_values = raw_query( aggregations=[ [aggregate_function, aggregate_column, "aggregate"], ["count", None, "count"], ], conditions=conditions, start=snuba_filter.start, end=snuba_filter.end, filter_keys=snuba_filter.filter_keys, orderby=orderby + ["tags_key"], groupby=["tags_key", "tags_value"], dataset=Dataset.Discover, referrer=referrer, sample=sample_rate, turbo=sample_rate is not None, limitby=[TOP_VALUES_DEFAULT_LIMIT, "tags_key"], ) results.extend([ PerformanceFacetResult(r["tags_key"], r["tags_value"], float(r["aggregate"]), int(r["count"])) for r in tag_values["data"] ]) return results
def get_facets(query, params, limit=10, referrer=None): """ High-level API for getting 'facet map' results. Facets are high frequency tags and attribute results that can be used to further refine user queries. When many projects are requested sampling will be enabled to help keep response times low. query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment limit (int) The number of records to fetch. referrer (str|None) A referrer string to help locate the origin of this query. Returns Sequence[FacetResult] """ snuba_filter = get_filter(query, params) # TODO(mark) Refactor the need for this translation shim. snuba_args = { "start": snuba_filter.start, "end": snuba_filter.end, "conditions": snuba_filter.conditions, "filter_keys": snuba_filter.filter_keys, } # Resolve the public aliases into the discover dataset names. snuba_args, translated_columns = resolve_discover_aliases(snuba_args) # Force sampling for multi-project results as we don't need accuracy # with that much data. sample = len(snuba_filter.filter_keys["project_id"]) > 2 # Exclude tracing tags as they are noisy and generally not helpful. excluded_tags = [ "tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span"] ] # Get the most frequent tag keys, enable sampling # as we don't need accuracy here. key_names = raw_query( aggregations=[["count", None, "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), orderby=["-count", "tags_key"], groupby="tags_key", having=[excluded_tags], dataset=Dataset.Discover, limit=limit, referrer=referrer, turbo=sample, ) top_tags = [r["tags_key"] for r in key_names["data"]] if not top_tags: return [] fetch_projects = False if len(params.get("project_id", [])) > 1: if len(top_tags) == limit: top_tags.pop() fetch_projects = True results = [] if fetch_projects: project_values = raw_query( aggregations=[["count", None, "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), groupby="project_id", orderby="-count", dataset=Dataset.Discover, referrer=referrer, ) results.extend([ FacetResult("project", r["project_id"], r["count"]) for r in project_values["data"] ]) # Get tag counts for our top tags. Fetching them individually # allows snuba to leverage promoted tags better and enables us to get # the value count we want. for tag_name in top_tags: tag = u"tags[{}]".format(tag_name) tag_values = raw_query( aggregations=[["count", None, "count"]], conditions=snuba_args.get("conditions"), start=snuba_args.get("start"), end=snuba_args.get("end"), filter_keys=snuba_args.get("filter_keys"), orderby=["-count"], groupby=[tag], limit=TOP_VALUES_DEFAULT_LIMIT, dataset=Dataset.Discover, referrer=referrer, ) results.extend([ FacetResult(tag_name, r[tag], int(r["count"])) for r in tag_values["data"] ]) return results
def validate_conditions(self, conditions): try: get_filter(conditions) except InvalidSearchQuery as err: raise serializers.ValidationError("Invalid conditions: {}".format(err)) return conditions
def validate(self, data):
    organization = self.context["organization"]
    query_info = data["query_info"]

    # Validate the project field, if provided
    # A PermissionDenied error will be raised in `get_projects_by_id` if the request is invalid
    project_query = query_info.get("project")
    if project_query:
        get_projects_by_id = self.context["get_projects_by_id"]
        # Coerce the query into a set
        if isinstance(project_query, list):
            projects = get_projects_by_id(set(map(int, project_query)))
        else:
            projects = get_projects_by_id({int(project_query)})
        query_info["project"] = [project.id for project in projects]

    # Discover Pre-processing
    if data["query_type"] == ExportQueryType.DISCOVER_STR:
        # coerce the fields into a list as needed
        fields = query_info.get("field", [])
        if not isinstance(fields, list):
            fields = [fields]

        if len(fields) > MAX_FIELDS:
            detail = f"You can export up to {MAX_FIELDS} fields at a time. Please delete some and try again."
            raise serializers.ValidationError(detail)
        elif len(fields) == 0:
            raise serializers.ValidationError("at least one field is required to export")

        if "query" not in query_info:
            detail = "query is required to export; please pass an empty string if you don't want to set one"
            raise serializers.ValidationError(detail)

        query_info["field"] = fields

        if not query_info.get("project"):
            projects = self.context["get_projects"]()
            query_info["project"] = [project.id for project in projects]

        # make sure to fix the export start/end times to ensure consistent results
        try:
            start, end = get_date_range_from_params(query_info)
        except InvalidParams as e:
            sentry_sdk.set_tag("query.error_reason", "Invalid date params")
            raise serializers.ValidationError(str(e))

        if "statsPeriod" in query_info:
            del query_info["statsPeriod"]
        if "statsPeriodStart" in query_info:
            del query_info["statsPeriodStart"]
        if "statsPeriodEnd" in query_info:
            del query_info["statsPeriodEnd"]
        query_info["start"] = start.isoformat()
        query_info["end"] = end.isoformat()

        # validate the query string by trying to parse it
        processor = DiscoverProcessor(
            discover_query=query_info,
            organization_id=organization.id,
        )
        try:
            snuba_filter = get_filter(query_info["query"], processor.params)
            resolve_field_list(
                fields.copy(),
                snuba_filter,
                auto_fields=True,
                auto_aggregations=True,
            )
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(str(err))

    return data
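# Example of a minimal valid Discover export payload for this serializer
# (field names and query string are hypothetical):
#
#     query_info = {
#         "project": [1],
#         "field": ["title", "count()"],
#         "query": "event.type:error",
#         "statsPeriod": "24h",
#     }
#
# After validation, "statsPeriod" is removed and replaced with concrete
# ISO-8601 "start"/"end" values so repeated exports return consistent rows.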
def get_facets(query, params, limit=20, referrer=None): """ High-level API for getting 'facet map' results. Facets are high frequency tags and attribute results that can be used to further refine user queries. When many projects are requested sampling will be enabled to help keep response times low. query (str) Filter query string to create conditions from. params (Dict[str, str]) Filtering parameters with start, end, project_id, environment limit (int) The number of records to fetch. referrer (str|None) A referrer string to help locate the origin of this query. Returns Sequence[FacetResult] """ snuba_filter = get_filter(query, params) # TODO(mark) Refactor the need for this translation shim. snuba_args = { "start": snuba_filter.start, "end": snuba_filter.end, "conditions": snuba_filter.conditions, "filter_keys": snuba_filter.filter_keys, } # Resolve the public aliases into the discover dataset names. snuba_args, translated_columns = resolve_discover_aliases(snuba_args) # Force sampling for more than 9 projects. 9 was chosen arbitrarily. sample = len(snuba_filter.filter_keys["project_id"]) > 9 # Exclude tracing tags as they are noisy and generally not helpful. conditions = snuba_args.get("conditions", []) conditions.append( ["tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span"]]) # Get the most frequent tag keys, enable sampling # as we don't need accuracy here. key_names = raw_query( aggregations=[["count", None, "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), orderby=["-count", "tags_key"], groupby="tags_key", dataset=Dataset.Discover, limit=limit, referrer=referrer, turbo=sample, ) top_tags = [r["tags_key"] for r in key_names["data"]] if not top_tags: return [] fetch_projects = False if len(params.get("project_id", [])) > 1: if len(top_tags) == limit: top_tags.pop() fetch_projects = True results = [] if fetch_projects: project_values = raw_query( aggregations=[["uniq", "event_id", "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), groupby="project_id", orderby="-count", dataset=Dataset.Discover, referrer=referrer, ) results.extend([ FacetResult("project", r["project_id"], r["count"]) for r in project_values["data"] ]) # Environment is a special case because of the "" value which is stored as null # in the environment column but not in the tag arrays. if "environment" in top_tags: top_tags.remove("environment") environment_values = raw_query( aggregations=[["uniq", "event_id", "count"]], start=snuba_args.get("start"), end=snuba_args.get("end"), conditions=snuba_args.get("conditions"), filter_keys=snuba_args.get("filter_keys"), groupby="environment", orderby=["-count", "environment"], dataset=Dataset.Discover, referrer=referrer, ) results.extend([ FacetResult("environment", r["environment"], r["count"]) for r in environment_values["data"] ]) # Get tag counts for our top tags. conditions.append(["tags_key", "IN", top_tags]) tag_values = raw_query( aggregations=[["count", None, "count"]], conditions=conditions, start=snuba_args.get("start"), end=snuba_args.get("end"), filter_keys=snuba_args.get("filter_keys"), orderby=["tags_key", "-count"], groupby=["tags_key", "tags_value"], dataset=Dataset.Discover, referrer=referrer, ) results.extend([ FacetResult(r["tags_key"], r["tags_value"], int(r["count"])) for r in tag_values["data"] ]) return results