Example #1
def transform_deprecated_functions_in_orderby(orderby):
    if not orderby:
        return

    orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
    new_orderby = []
    for order in orderby:
        has_negative = False
        column = order
        if order.startswith("-"):
            has_negative = True
            column = order.strip("-")

        new_column = column
        if column in OLD_FUNCTIONS_TO_NEW:
            new_column = get_function_alias(OLD_FUNCTIONS_TO_NEW[column])
        elif column.replace("()", "") in OLD_FUNCTIONS_TO_NEW:
            new_column = get_function_alias(OLD_FUNCTIONS_TO_NEW[column.replace("()", "")])

        if has_negative:
            new_column = "-" + new_column

        new_orderby.append(new_column)

    return new_orderby
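A minimal sketch of how this transformation might behave. The OLD_FUNCTIONS_TO_NEW mapping and get_function_alias below are hypothetical stand-ins for the real module-level helpers, shown only to illustrate the orderby rewrite:

import re

# Hypothetical stand-ins; the real mapping and alias rules live elsewhere.
OLD_FUNCTIONS_TO_NEW = {"p75": "percentile(transaction.duration, 0.75)"}

def get_function_alias(field):
    # Simplified alias rule for this sketch: drop the trailing ")" and replace
    # the remaining non-alphanumeric characters with underscores.
    return re.sub(r"[^a-zA-Z0-9]+", "_", field.rstrip(")")).strip("_")

# With these stubs, a deprecated orderby keeps its descending prefix while the
# column is swapped for the new function's alias:
#   transform_deprecated_functions_in_orderby(["-p75()", "timestamp"])
#   -> ["-percentile_transaction_duration_0_75", "timestamp"]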
Example #2
def find_measurements_min_max(measurements, min_value, max_value, user_query,
                              params):
    """
    Find the min/max value of the specified measurements. If either min/max is already
    specified, it will be used and not queried for.

    :param [str] measurements: The list of measurements to generate the
        histograms for.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    """

    if min_value is not None and max_value is not None:
        return min_value, max_value

    min_columns, max_columns = [], []
    for measurement in measurements:
        if min_value is None:
            min_columns.append("min(measurements.{})".format(measurement))
        if max_value is None:
            max_columns.append("max(measurements.{})".format(measurement))

    results = query(
        selected_columns=min_columns + max_columns,
        query=user_query,
        params=params,
        limit=1,
        referrer="api.organization-events-measurements-min-max",
        auto_fields=True,
        use_aggregate_conditions=True,
    )

    data = results.get("data")

    # there should be exactly 1 row in the results, but if something went wrong here,
    # we force the min/max to be None to coerce an empty histogram
    if data is None or len(data) != 1:
        return None, None

    row = data[0]

    if min_value is None:
        min_values = [
            row[get_function_alias(column)] for column in min_columns
        ]
        min_values = list(filter(lambda v: v is not None, min_values))
        min_value = min(min_values) if min_values else None

    if max_value is None:
        max_values = [
            row[get_function_alias(column)] for column in max_columns
        ]
        max_values = list(filter(lambda v: v is not None, max_values))
        max_value = max(max_values) if max_values else None

    return min_value, max_value
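For illustration, a hedged sketch of the extraction step above, assuming the query returns a single row keyed by the function aliases (the alias names here are assumptions):

# Hypothetical single-row result from the min/max query.
row = {
    "min_measurements_fcp": 120.0,
    "min_measurements_lcp": None,   # no data recorded for this measurement
    "max_measurements_fcp": 3400.0,
    "max_measurements_lcp": None,
}

# None values are filtered out before taking the min/max, mirroring the code above.
min_values = [v for k, v in row.items() if k.startswith("min_") and v is not None]
max_values = [v for k, v in row.items() if k.startswith("max_") and v is not None]
min_value = min(min_values) if min_values else None   # -> 120.0
max_value = max(max_values) if max_values else None   # -> 3400.0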
Example #3
    def get(self, request, organization):
        """ Find the event id with the closest value to an aggregate for a given query """
        if not self.has_feature(organization, request):
            return Response(status=404)

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="filter_params") as span:
            span.set_data("organization", organization)
            try:
                params = self.get_filter_params(request, organization)
            except NoProjects:
                return Response(status=404)
            params = self.quantize_date_params(request, params)

        # Assumption is that users will want the 50th percentile
        baseline_function = request.GET.get("baselineFunction", "p50()")
        # If the baseline was calculated already save ourselves a query
        baseline_value = request.GET.get("baselineValue")
        baseline_alias = get_function_alias(baseline_function)

        with self.handle_query_errors():
            if baseline_value is None:
                result = discover.query(
                    selected_columns=[baseline_function],
                    params=params,
                    query=request.GET.get("query"),
                    limit=1,
                    referrer="api.transaction-baseline.get_value",
                )
                baseline_value = result["data"][0].get(
                    baseline_alias) if "data" in result else None
                if baseline_value is None:
                    return Response(status=404)

            delta_column = "absolute_delta(transaction.duration,{})".format(
                baseline_value)

            result = discover.query(
                selected_columns=[
                    "project",
                    "timestamp",
                    "id",
                    "transaction.duration",
                    delta_column,
                ],
                # Find the most recent transaction that's closest to the baseline value
                # id as the last item for consistent results
                orderby=[get_function_alias(delta_column), "-timestamp", "id"],
                params=params,
                query=request.GET.get("query"),
                limit=1,
                referrer="api.transaction-baseline.get_id",
            )
            if len(result["data"]) == 0:
                return Response(status=404)

        baseline_data = result["data"][0]
        baseline_data[baseline_alias] = baseline_value
        return Response(baseline_data)
Example #4
def normalize_measurements_histogram(measurements, num_buckets, key_col, histogram_params, results):
    """
    Normalizes the histogram results by renaming the columns to key and bin
    and zerofilling any missing values.

    :param [str] measurements: The list of measurements to generate the
        histograms for.
    :param int num_buckets: The number of buckets the histogram should contain.
    :param str key_col: The column of the key name.
    :param HistogramParams histogram_params: The histogram parameters used.
    :param any results: The results from the histogram query that may be missing
        bins and need to be normalized.
    """

    measurements = sorted(measurements)
    key_name = get_function_alias(key_col)
    bin_name = get_function_alias(get_measurements_histogram_col(histogram_params))

    # adjust the meta for the renamed columns
    meta = results["meta"]
    new_meta = []

    meta_map = {key_name: "key", bin_name: "bin"}
    for col in meta:
        new_meta.append({"type": col["type"], "name": meta_map.get(col["name"], col["name"])})

    results["meta"] = new_meta

    # zerofill and rename the columns while making sure to adjust for precision
    data = results["data"]
    new_data = []

    bucket_maps = {m: {} for m in measurements}
    for row in data:
        measurement = row[key_name]
        # we expect the bin to be an integer; this is because all floating
        # point values are rounded during the calculation
        bucket = int(row[bin_name])
        # ignore unexpected measurements
        if measurement in bucket_maps:
            bucket_maps[measurement][bucket] = row["count"]

    for i in range(num_buckets):
        bucket = histogram_params.start_offset + histogram_params.bucket_size * i
        for measurement in measurements:
            # we want to rename the columns here
            row = {
                "key": measurement,
                "bin": bucket,
                "count": bucket_maps[measurement].get(bucket, 0),
            }
            # make sure to adjust for the precision if necessary
            if histogram_params.multiplier > 1:
                row["bin"] /= float(histogram_params.multiplier)
            new_data.append(row)

    results["data"] = new_data

    return results
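A small sketch of the zerofill and precision adjustment above, assuming a HistogramParams-like tuple with bucket_size, start_offset and multiplier fields (a simplification of the real structure):

from collections import namedtuple

# Assumed shape of the histogram parameters used above.
HistogramParams = namedtuple("HistogramParams", ["bucket_size", "start_offset", "multiplier"])

params = HistogramParams(bucket_size=10, start_offset=0, multiplier=100)  # precision = 2
observed = {0: 5, 20: 3}  # integer bins -> counts, as returned by the query

rows = []
for i in range(4):  # num_buckets = 4
    raw_bin = params.start_offset + params.bucket_size * i
    row = {"bin": raw_bin, "count": observed.get(raw_bin, 0)}
    if params.multiplier > 1:
        row["bin"] /= float(params.multiplier)  # undo the precision scaling
    rows.append(row)

# rows -> [{"bin": 0.0, "count": 5}, {"bin": 0.1, "count": 0},
#          {"bin": 0.2, "count": 3}, {"bin": 0.3, "count": 0}]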
Example #5
def normalize_histogram_results(fields, key_column, histogram_params, results,
                                array_column):
    """
    Normalizes the histogram results by renaming the columns to key and bin
    and zerofilling any missing values.

    :param [str] fields: The list of fields to generate the
        histograms for.
    :param str key_column: The column of the key name.
    :param HistogramParams histogram_params: The histogram parameters used.
    :param any results: The results from the histogram query that may be missing
        bins and need to be normalized.
    :param str array_column: Array column prefix
    """

    # `key_name` is only used when generating a multi histogram of measurement values.
    # It contains the name of the corresponding measurement for that row.
    key_name = None if key_column is None else get_function_alias(key_column)
    histogram_column = get_histogram_column(fields, key_column,
                                            histogram_params, array_column)
    bin_name = get_function_alias(histogram_column)

    # zerofill and rename the columns while making sure to adjust for precision
    bucket_maps = {field: {} for field in fields}
    for row in results["data"]:
        # Fall back to the first field name if there is no `key_name`;
        # otherwise, treat this as an array value name and format it as such.
        key = (
            fields[0] if key_name is None else
            f"{get_array_column_alias(array_column)}.{get_array_column_field(array_column, row[key_name])}"
        )
        # we expect the bin to be an integer; this is because all floating
        # point values are rounded during the calculation
        bucket = int(row[bin_name])
        # ignore unexpected keys
        if key in bucket_maps:
            bucket_maps[key][bucket] = row["count"]

    new_data = {field: [] for field in fields}
    for i in range(histogram_params.num_buckets):
        bucket = histogram_params.start_offset + histogram_params.bucket_size * i
        for field in fields:
            row = {
                "bin": bucket,
                "count": bucket_maps[field].get(bucket, 0),
            }
            # make sure to adjust for the precision if necessary
            if histogram_params.multiplier > 1:
                row["bin"] /= float(histogram_params.multiplier)
            new_data[field].append(row)

    return new_data
Example #6
def transform_deprecated_functions_in_columns(columns):
    new_list = []
    translations = {}
    for column in columns:
        if column in OLD_FUNCTIONS_TO_NEW:
            new_column = OLD_FUNCTIONS_TO_NEW[column]
            translations[get_function_alias(new_column)] = column
            new_list.append(new_column)
        elif column.replace("()", "") in OLD_FUNCTIONS_TO_NEW:
            new_column = OLD_FUNCTIONS_TO_NEW[column.replace("()", "")]
            translations[get_function_alias(new_column)] = column.replace("()", "")
            new_list.append(new_column)
        else:
            new_list.append(column)

    return new_list, translations
Example #7
    def validate(self, data):
        if not data.get("id"):
            keys = set(data.keys())
            if self.required_for_create - keys:
                raise serializers.ValidationError({
                    "fields":
                    "fields are required during creation.",
                    "conditions":
                    "conditions are required during creation.",
                })

        # Validate the query that would be created when run.
        conditions = self._get_attr(data, "conditions", "")
        fields = self._get_attr(data, "fields", [])
        orderby = self._get_attr(data, "orderby", "")
        try:
            snuba_filter = get_filter(conditions)
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(
                {"conditions": f"Invalid conditions: {err}"})

        if orderby:
            snuba_filter.orderby = get_function_alias(orderby)
        try:
            resolve_field_list(fields, snuba_filter)
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(
                {"fields": f"Invalid fields: {err}"})
        return data
Example #8
 def serialize_multiple_axis(self, serializer, event_result, columns,
                             query_columns):
     # Return with requested yAxis as the key
     return {
         column: serializer.serialize(event_result,
                                      get_function_alias(query_column))
         for column, query_column in zip(columns, query_columns)
     }
Example #9
    def get_event_stats_data(self,
                             request,
                             organization,
                             get_event_stats,
                             top_events=False):
        try:
            columns = request.GET.getlist("yAxis", ["count()"])
            query = request.GET.get("query")
            params = self.get_filter_params(request, organization)
            rollup = get_rollup_from_request(
                request,
                params,
                "1h",
                InvalidSearchQuery(
                    "Your interval and date range would create too many results. "
                    "Use a larger interval, or a smaller date range."),
            )
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "rpm()": "rpm(%d)" % rollup,
                "rps()": "rps(%d)" % rollup,
            }
            query_columns = [
                column_map.get(column, column) for column in columns
            ]
            reference_event = self.reference_event(request, organization,
                                                   params.get("start"),
                                                   params.get("end"))

            result = get_event_stats(query_columns, query, params, rollup,
                                     reference_event)
        except (discover.InvalidSearchQuery,
                snuba.QueryOutsideRetentionError) as error:
            raise ParseError(detail=six.text_type(error))
        serializer = SnubaTSResultSerializer(organization, None, request.user)

        if top_events:
            results = {}
            for key, event_result in six.iteritems(result):
                if len(query_columns) > 1:
                    results[key] = self.serialize_multiple_axis(
                        serializer, event_result, columns, query_columns)
                else:
                    # Need to get function alias if count is a field, but not the axis
                    results[key] = serializer.serialize(
                        event_result, get_function_alias(query_columns[0]))
            return results
        elif len(query_columns) > 1:
            return self.serialize_multiple_axis(serializer, result, columns,
                                                query_columns)
        else:
            return serializer.serialize(result)
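For reference, the column_map rewrite above simply swaps the legacy aliases for their rollup-aware equivalents; a quick sketch with an assumed one-hour rollup:

rollup = 3600  # matches the "1h" default interval above
column_map = {
    "user_count": "count_unique(user)",
    "event_count": "count()",
    "rpm()": "rpm(%d)" % rollup,
    "rps()": "rps(%d)" % rollup,
}

columns = ["event_count", "rpm()"]
query_columns = [column_map.get(column, column) for column in columns]
# query_columns -> ["count()", "rpm(3600)"]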
Example #10
 def serialize_multiple_axis(self, serializer, event_result, columns, query_columns):
     # Return with requested yAxis as the key
     result = {
         columns[index]: serializer.serialize(
             event_result, get_function_alias(query_column), order=index
         )
         for index, query_column in enumerate(query_columns)
     }
     # Set order if multi-axis + top events
     if "order" in event_result.data:
         result["order"] = event_result.data["order"]
     return result
Example #11
 def __init__(self, organization_id, discover_query):
     self.projects = self.get_projects(organization_id, discover_query)
     self.start, self.end = get_date_range_from_params(discover_query)
     self.params = {
         "organization_id": organization_id,
         "project_id": [project.id for project in self.projects],
         "start": self.start,
         "end": self.end,
     }
     self.header_fields = map(lambda x: get_function_alias(x),
                              discover_query["field"])
     self.data_fn = self.get_data_fn(fields=discover_query["field"],
                                     query=discover_query["query"],
                                     params=self.params)
    def get(self, request, organization):
        if not features.has("organizations:discover-basic",
                            organization,
                            actor=request.user):
            return self.get_v1_results(request, organization)

        try:
            columns = request.GET.getlist("yAxis", ["count()"])
            params = self.get_filter_params(request, organization)
            rollup = self.get_rollup(request, params)
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "rpm()": "rpm(%d)" % rollup,
                "rps()": "rps(%d)" % rollup,
            }
            query_columns = [
                column_map.get(column, column) for column in columns
            ]

            result = discover.timeseries_query(
                selected_columns=query_columns,
                query=request.GET.get("query"),
                params=params,
                rollup=rollup,
                reference_event=self.reference_event(request, organization,
                                                     params.get("start"),
                                                     params.get("end")),
                referrer="api.organization-event-stats",
            )
        except InvalidSearchQuery as err:
            raise ParseError(detail=six.text_type(err))
        serializer = SnubaTSResultSerializer(organization, None, request.user)
        if len(columns) > 1:
            # Return with requested yAxis as the key
            data = {
                column: serializer.serialize(result,
                                             get_function_alias(query_column))
                for column, query_column in zip(columns, query_columns)
            }
        else:
            data = serializer.serialize(result)
        return Response(data, status=200)
Example #13
 def __init__(self, organization_id, discover_query):
     self.projects = self.get_projects(organization_id, discover_query)
     self.environments = self.get_environments(organization_id,
                                               discover_query)
     self.start, self.end = get_date_range_from_params(discover_query)
     self.params = {
         "organization_id": organization_id,
         "project_id": [project.id for project in self.projects],
         "start": self.start,
         "end": self.end,
     }
     # make sure to only include environment if any are given
     # an empty list DOES NOT work
     if self.environments:
         self.params["environment"] = self.environments
     self.header_fields = map(lambda x: get_function_alias(x),
                              discover_query["field"])
     self.data_fn = self.get_data_fn(fields=discover_query["field"],
                                     query=discover_query["query"],
                                     params=self.params)
    def get_event_stats_data(self, request, organization, get_event_stats):
        try:
            columns = request.GET.getlist("yAxis", ["count()"])
            query = request.GET.get("query")
            params = self.get_filter_params(request, organization)
            rollup = get_rollup_from_request(
                request,
                params,
                "1h",
                InvalidSearchQuery(
                    "Your interval and date range would create too many results. "
                    "Use a larger interval, or a smaller date range."
                ),
            )
            # Backwards compatibility for incidents which uses the old
            # column aliases as it straddles both versions of events/discover.
            # We will need these aliases until discover2 flags are enabled for all
            # users.
            column_map = {
                "user_count": "count_unique(user)",
                "event_count": "count()",
                "rpm()": "rpm(%d)" % rollup,
                "rps()": "rps(%d)" % rollup,
            }
            query_columns = [column_map.get(column, column) for column in columns]
            reference_event = self.reference_event(
                request, organization, params.get("start"), params.get("end")
            )

            result = get_event_stats(query_columns, query, params, rollup, reference_event)
        except InvalidSearchQuery as err:
            raise ParseError(detail=six.text_type(err))
        serializer = SnubaTSResultSerializer(organization, None, request.user)
        if len(columns) > 1:
            # Return with requested yAxis as the key
            return {
                column: serializer.serialize(result, get_function_alias(query_column))
                for column, query_column in zip(columns, query_columns)
            }
        else:
            return serializer.serialize(result)
Example #15
    def validate(self, data):
        if not data.get("id"):
            keys = set(data.keys())
            if self.required_for_create - keys:
                raise serializers.ValidationError({
                    "fields":
                    "fields are required during creation.",
                    "conditions":
                    "conditions are required during creation.",
                })

        # Validate the query that would be created when run.
        conditions = self._get_attr(data, "conditions", "")
        fields = self._get_attr(data, "fields", []).copy()
        orderby = self._get_attr(data, "orderby", "")
        try:
            # When using the eps/epm functions, they require an interval argument
            # or to provide the start/end so that the interval can be computed.
            # This uses a hard coded start/end to ensure the validation succeeds
            # since the values themselves don't matter.
            params = {
                "start": datetime.now() - timedelta(days=1),
                "end": datetime.now(),
                "project_id": [p.id for p in self.context.get("projects")],
            }

            snuba_filter = get_filter(conditions, params=params)
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(
                {"conditions": f"Invalid conditions: {err}"})

        if orderby:
            snuba_filter.orderby = get_function_alias(orderby)
        try:
            resolve_field_list(fields, snuba_filter)
        except InvalidSearchQuery as err:
            raise serializers.ValidationError(
                {"fields": f"Invalid fields: {err}"})
        return data
Example #16
def find_histogram_min_max(fields,
                           min_value,
                           max_value,
                           user_query,
                           params,
                           data_filter=None):
    """
    Find the min/max value of the specified fields. If either min/max is already
    specified, it will be used and not queried for.

    :param [str] fields: The list of fields to generate the histograms for.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    """

    if min_value is not None and max_value is not None:
        return min_value, max_value

    min_columns = []
    max_columns = []
    quartiles = []
    for field in fields:
        if min_value is None:
            min_columns.append(f"min({field})")
        if max_value is None:
            max_columns.append(f"max({field})")
        if data_filter == "exclude_outliers":
            quartiles.append(f"percentile({field}, 0.25)")
            quartiles.append(f"percentile({field}, 0.75)")

    results = query(
        selected_columns=min_columns + max_columns + quartiles,
        query=user_query,
        params=params,
        limit=1,
        referrer="api.organization-events-histogram-min-max",
    )

    data = results.get("data")

    # there should be exactly 1 row in the results, but if something went wrong here,
    # we force the min/max to be None to coerce an empty histogram
    if data is None or len(data) != 1:
        return None, None

    row = data[0]

    if min_value is None:
        min_values = [
            row[get_function_alias(column)] for column in min_columns
        ]
        min_values = list(filter(lambda v: v is not None, min_values))
        min_value = min(min_values) if min_values else None

    if max_value is None:
        max_values = [
            row[get_function_alias(column)] for column in max_columns
        ]
        max_values = list(filter(lambda v: v is not None, max_values))
        max_value = max(max_values) if max_values else None

        fences = []
        if data_filter == "exclude_outliers":
            for field in fields:
                q1_alias = get_function_alias(f"percentile({field}, 0.25)")
                q3_alias = get_function_alias(f"percentile({field}, 0.75)")

                first_quartile = row[q1_alias]
                third_quartile = row[q3_alias]

                if (first_quartile is None or third_quartile is None
                        or math.isnan(first_quartile)
                        or math.isnan(third_quartile)):
                    continue

                interquartile_range = abs(third_quartile - first_quartile)
                upper_outer_fence = third_quartile + 3 * interquartile_range
                fences.append(upper_outer_fence)

        max_fence_value = max(fences) if fences else None

        candidates = [max_fence_value, max_value]
        candidates = list(filter(lambda v: v is not None, candidates))
        max_value = min(candidates) if candidates else None

    return min_value, max_value
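The exclude_outliers branch above clamps the queried maximum to an upper outer fence derived from the quartiles. A quick worked example of that arithmetic, using illustrative values:

first_quartile, third_quartile = 200.0, 500.0   # illustrative p25 / p75 values
queried_max = 9000.0                            # raw max(field) from the query

interquartile_range = abs(third_quartile - first_quartile)    # 300.0
upper_outer_fence = third_quartile + 3 * interquartile_range  # 1400.0

# The final max is the smaller of the fence and the raw max, so a handful of
# extreme outliers no longer stretch the histogram range.
max_value = min(upper_outer_fence, queried_max)               # 1400.0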
Example #17
def prepare_discover_query(
    selected_columns,
    query,
    params,
    orderby=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    with sentry_sdk.start_span(op="discover.discover",
                               description="query.filter_transform") as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby,
                                                  (list,
                                                   tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations)
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}."
                        .format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        ))
            else:
                found = any(having_clause[0] == agg_clause[-1]
                            for agg_clause in snuba_filter.aggregations)
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}."
                        .format(
                            having_clause[0],
                            error_extra,
                        ))

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    return PreparedQuery(snuba_filter, translated_columns, resolved_fields)
Example #18
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
                    in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregates conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # We clobber this value throughout this code, so copy the value
    selected_columns = selected_columns[:]

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.filter_transform") as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    function_translations = {}

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby,
                                                  (list,
                                                   tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations)
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}."
                        .format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        ))
            else:
                found = any(having_clause[0] == agg_clause[-1]
                            for agg_clause in snuba_filter.aggregations)
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}."
                        .format(
                            having_clause[0],
                            error_extra,
                        ))

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(result, resolved_fields["functions"],
                                 translated_columns, snuba_filter,
                                 selected_columns)
    def get(self, request, organization):
        if not self.has_feature(organization, request):
            return Response(status=404)

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="parse params"):
            try:
                params = self.get_snuba_params(request, organization)
            except NoProjects:
                return Response([])

            vitals = [
                vital.lower() for vital in request.GET.getlist("vital", [])
            ]
            if len(vitals) == 0:
                raise ParseError(detail="Need to pass at least one vital")

            selected_columns = []
            aliases = {}
            for vital in vitals:
                if vital not in self.VITALS:
                    raise ParseError(detail=f"{vital} is not a valid vital")
                aliases[vital] = []
                for index, threshold in enumerate(
                        self.VITALS[vital]["thresholds"]):
                    column = f"count_at_least({vital}, {threshold})"
                    # Order aliases for later calculation
                    aliases[vital].append(get_function_alias(column))
                    selected_columns.append(column)
                selected_columns.append(f"p75({vital})")

        with self.handle_query_errors():
            events_results = discover.query(
                selected_columns=selected_columns,
                query=request.GET.get("query"),
                params=params,
                # Results should only ever have 1 result
                limit=1,
                referrer="api.events.vitals",
                auto_fields=True,
                auto_aggregations=True,
                use_aggregate_conditions=True,
            )

        results = {}
        if len(events_results["data"]) == 1:
            event_data = events_results["data"][0]
            for vital in vitals:
                groups = len(aliases[vital])
                results[vital] = {}
                total = 0

                # Go backwards so that we can subtract and get the running total
                for i in range(groups - 1, -1, -1):
                    count = event_data[aliases[vital][i]]
                    group_count = 0 if count is None else count - total
                    results[vital][self.LABELS[i]] = group_count
                    total += group_count

                results[vital]["total"] = total
                results[vital]["p75"] = event_data.get(
                    get_function_alias(f"p75({vital})"))

        return Response(results)
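Because each count_at_least column is cumulative, the backwards loop above turns those counts into per-bucket counts by subtracting the running total. A small worked example with illustrative numbers (the counts and label names are assumptions):

# Cumulative counts for two thresholds of a vital, lowest threshold first.
counts = [30, 10]          # e.g. count_at_least(vital, t1), count_at_least(vital, t2)
labels = ["Meh", "Poor"]   # assumed label ordering for this sketch

results, total = {}, 0
for i in range(len(counts) - 1, -1, -1):
    count = counts[i]
    group_count = 0 if count is None else count - total
    results[labels[i]] = group_count
    total += group_count

# results -> {"Poor": 10, "Meh": 20}; total -> 30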
    def get_event_stats_data(
        self,
        request,
        organization,
        get_event_stats,
        top_events=False,
        query_column="count()",
        params=None,
        query=None,
    ):
        with self.handle_query_errors():
            with sentry_sdk.start_span(
                    op="discover.endpoint",
                    description="base.stats_query_creation"):
                columns = request.GET.getlist("yAxis", [query_column])
                if query is None:
                    query = request.GET.get("query")
                if params is None:
                    try:
                        # events-stats is still used by events v1 which doesn't require global views
                        params = self.get_snuba_params(
                            request, organization, check_global_views=False)
                    except NoProjects:
                        return {"data": []}

                rollup = get_rollup_from_request(
                    request,
                    params,
                    "1h",
                    InvalidSearchQuery(
                        "Your interval and date range would create too many results. "
                        "Use a larger interval, or a smaller date range."),
                )
                # Backwards compatibility for incidents which uses the old
                # column aliases as it straddles both versions of events/discover.
                # We will need these aliases until discover2 flags are enabled for all
                # users.
                column_map = {
                    "user_count": "count_unique(user)",
                    "event_count": "count()",
                    "epm()": "epm(%d)" % rollup,
                    "eps()": "eps(%d)" % rollup,
                }
                query_columns = [
                    column_map.get(column, column) for column in columns
                ]
            with sentry_sdk.start_span(op="discover.endpoint",
                                       description="base.stats_query"):
                result = get_event_stats(query_columns, query, params, rollup)

        serializer = SnubaTSResultSerializer(organization, None, request.user)

        with sentry_sdk.start_span(op="discover.endpoint",
                                   description="base.stats_serialization"):
            if top_events:
                results = {}
                for key, event_result in six.iteritems(result):
                    if len(query_columns) > 1:
                        results[key] = self.serialize_multiple_axis(
                            serializer, event_result, columns, query_columns)
                    else:
                        # Need to get function alias if count is a field, but not the axis
                        results[key] = serializer.serialize(
                            event_result,
                            column=get_function_alias(query_columns[0]))
                return results
            elif len(query_columns) > 1:
                return self.serialize_multiple_axis(serializer, result,
                                                    columns, query_columns)
            else:
                return serializer.serialize(result)
Example #21
def histogram_query(
    fields,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
):
    """
    API for generating histograms for numeric columns.

    A multihistogram is possible only if the columns are all array columns.
    Array columns are columns whose values are nested arrays.
    Measurements and span op breakdowns are examples of array columns.
    The resulting histograms will have their bins aligned.

    :param [str] fields: The list of fields to generate histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    """

    multiplier = int(10**precision)
    if max_value is not None:
        # We want the specified max_value to be exclusive, and the queried max_value
        # to be inclusive. So we adjust the specified max_value using the multiplier.
        max_value -= 0.1 / multiplier
    min_value, max_value = find_histogram_min_max(fields, min_value, max_value,
                                                  user_query, params,
                                                  data_filter)

    key_column = None
    array_column = None
    histogram_function = None
    conditions = []
    if len(fields) > 1:
        array_column = check_multihistogram_fields(fields)
        if array_column == "measurements":
            key_column = "array_join(measurements_key)"
            histogram_function = get_measurement_name
        elif array_column == "span_op_breakdowns":
            key_column = "array_join(span_op_breakdowns_key)"
            histogram_function = get_span_op_breakdown_name
        else:
            raise InvalidSearchQuery(
                "multihistogram expected either all measurements or all breakdowns"
            )

        key_alias = get_function_alias(key_column)
        field_names = [histogram_function(field) for field in fields]
        conditions.append([key_alias, "IN", field_names])

    histogram_params = find_histogram_params(num_buckets, min_value, max_value,
                                             multiplier)
    histogram_column = get_histogram_column(fields, key_column,
                                            histogram_params, array_column)
    histogram_alias = get_function_alias(histogram_column)

    if min_value is None or max_value is None:
        return normalize_histogram_results(fields, key_column,
                                           histogram_params, {"data": []},
                                           array_column)
    # make sure to bound the bins to get the desired range of results
    if min_value is not None:
        min_bin = histogram_params.start_offset
        conditions.append([histogram_alias, ">=", min_bin])
    if max_value is not None:
        max_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
        conditions.append([histogram_alias, "<=", max_bin])

    columns = [] if key_column is None else [key_column]
    results = query(
        selected_columns=columns + [histogram_column, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        orderby=[histogram_alias],
        limit=len(fields) * num_buckets,
        referrer=referrer,
        functions_acl=["array_join", "histogram"],
    )

    return normalize_histogram_results(fields, key_column, histogram_params,
                                       results, array_column)
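A brief sketch of how the bin bounds above are derived, assuming a HistogramParams-like tuple (the real find_histogram_params is not shown in this excerpt):

from collections import namedtuple

# Assumed shape of the parameters returned by find_histogram_params.
HistogramParams = namedtuple(
    "HistogramParams", ["num_buckets", "bucket_size", "start_offset", "multiplier"]
)

params = HistogramParams(num_buckets=10, bucket_size=5, start_offset=100, multiplier=1)

min_bin = params.start_offset                                             # 100
max_bin = params.start_offset + params.bucket_size * params.num_buckets  # 150

# These bounds become conditions on the histogram alias, e.g.
#   [histogram_alias, ">=", 100] and [histogram_alias, "<=", 150],
# so only rows inside the requested range are returned.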
Example #22
def measurements_histogram_query(
    measurements,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
):
    """
    API for generating histograms specifically for measurements.

    This function allows you to generate histograms for multiple measurements
    at once. The resulting histograms will have their bins aligned.

    :param [str] measurements: The list of measurements to generate
        histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    """
    multiplier = int(10**precision)
    if max_value is not None:
        # We want the specified max_value to be exclusive, and the queried max_value
        # to be inclusive. So we adjust the specified max_value using the multiplier.
        max_value -= 0.1 / multiplier
    min_value, max_value = find_measurements_min_max(measurements, min_value,
                                                     max_value, user_query,
                                                     params, data_filter)

    key_col = "array_join(measurements_key)"
    histogram_params = find_measurements_histogram_params(
        num_buckets, min_value, max_value, multiplier)
    histogram_col = get_measurements_histogram_col(histogram_params)

    conditions = [[get_function_alias(key_col), "IN", measurements]]

    # make sure to bound the bins so as not to get too many results
    if min_value is not None:
        min_bin = histogram_params.start_offset
        conditions.append([get_function_alias(histogram_col), ">=", min_bin])
    if max_value is not None:
        max_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
        conditions.append([get_function_alias(histogram_col), "<=", max_bin])

    results = query(
        selected_columns=[key_col, histogram_col, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        # the zerofill step assumes it is ordered by the bin then name
        orderby=[
            get_function_alias(histogram_col),
            get_function_alias(key_col)
        ],
        limit=len(measurements) * num_buckets,
        referrer=referrer,
        auto_fields=True,
        use_aggregate_conditions=True,
        functions_acl=["array_join", "measurements_histogram"],
    )

    return normalize_measurements_histogram(measurements, num_buckets, key_col,
                                            histogram_params, results)
Example #23
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # TODO(evanh): These can be removed once we migrate the frontend / saved queries
    # to use the new function values
    selected_columns, function_translations = transform_deprecated_functions_in_columns(
        selected_columns)
    query = transform_deprecated_functions_in_query(query)

    snuba_filter = get_filter(query, params)
    if not use_aggregate_conditions:
        snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number into the columns before passing it through
    # to event search.
    idx = 0
    for col in selected_columns:
        if col.startswith("histogram("):
            histogram_column = find_histogram_buckets(col, params,
                                                      snuba_filter.conditions)
            selected_columns[idx] = histogram_column
            function_translations[get_function_alias(
                histogram_column)] = get_function_alias(col)
            break

        idx += 1

    # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
    if orderby:
        orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
        new_orderby = []
        for ordering in orderby:
            is_reversed = ordering.startswith("-")
            ordering = ordering.lstrip("-")
            for snuba_name, sentry_name in six.iteritems(
                    function_translations):
                if sentry_name == ordering:
                    ordering = snuba_name
                    break

            ordering = "{}{}".format("-" if is_reversed else "", ordering)
            new_orderby.append(ordering)

        snuba_filter.orderby = new_orderby

    snuba_filter.update_with(
        resolve_field_list(selected_columns,
                           snuba_filter,
                           auto_fields=auto_fields))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_filter.conditions.extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_filter, translated_columns = resolve_discover_aliases(
        snuba_filter, function_translations)

    # Make sure that any aggregate conditions are also in the selected columns
    for having_clause in snuba_filter.having:
        found = any(having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_filter.aggregations)
        if not found:
            raise InvalidSearchQuery(
                u"Aggregate {} used in a condition but is not a selected column."
                .format(having_clause[0]))

    if conditions is not None:
        snuba_filter.conditions.extend(conditions)

    result = raw_query(
        start=snuba_filter.start,
        end=snuba_filter.end,
        groupby=snuba_filter.groupby,
        conditions=snuba_filter.conditions,
        aggregations=snuba_filter.aggregations,
        selected_columns=snuba_filter.selected_columns,
        filter_keys=snuba_filter.filter_keys,
        having=snuba_filter.having,
        orderby=snuba_filter.orderby,
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_filter,
                             selected_columns)
Example #24
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")
    else:
        # We clobber this value throughout this code, so copy the value
        selected_columns = selected_columns[:]

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.filter_transform") as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            snuba_filter.having = []

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number into the columns before passing it through
    # to event search.
    idx = 0
    function_translations = {}
    for col in selected_columns:
        if col.startswith("histogram("):
            with sentry_sdk.start_span(
                    op="discover.discover",
                    description="query.histogram_calculation") as span:
                span.set_data("histogram", col)
                histogram_column = find_histogram_buckets(
                    col, params, snuba_filter.conditions)
                selected_columns[idx] = histogram_column
                snuba_name = get_function_alias(histogram_column)
                sentry_name = get_function_alias(col)
                function_translations[snuba_name] = sentry_name
                # Since we're completely renaming the histogram function, we need to also check if we are
                # ordering by the histogram values, and change that.
                if orderby is not None:
                    orderby = list(orderby) if isinstance(
                        orderby, (list, tuple)) else [orderby]
                    for i, ordering in enumerate(orderby):
                        if sentry_name == ordering.lstrip("-"):
                            ordering = "{}{}".format(
                                "-" if ordering.startswith("-") else "",
                                snuba_name)
                            orderby[i] = ordering

            break

        idx += 1

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby,
                                                  (list,
                                                   tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        snuba_filter.update_with(
            resolve_field_list(selected_columns,
                               snuba_filter,
                               auto_fields=auto_fields))

        if reference_event:
            ref_conditions = create_reference_event_conditions(reference_event)
            if ref_conditions:
                snuba_filter.conditions.extend(ref_conditions)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations)

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        else:
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1]
                                for agg_clause in snuba_filter.aggregations)
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        u"Aggregate(s) {} used in a condition but are not in the selected columns."
                        .format(", ".join(conditions_not_in_aggregations)))
            else:
                found = any(having_clause[0] == agg_clause[-1]
                            for agg_clause in snuba_filter.aggregations)
                if not found:
                    raise InvalidSearchQuery(
                        u"Aggregate {} used in a condition but is not a selected column."
                        .format(having_clause[0]))

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(result, translated_columns, snuba_filter,
                                 selected_columns)