Example #1
 def test_filter_keys_set(self):
     snuba.raw_query(
         start=datetime.now(),
         end=datetime.now(),
         filter_keys={
             'project_id': set([1]),
             'logger': set(['asdf']),
         },
         aggregations=[
             ['count()', '', 'count'],
         ],
     )
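The call above only exercises the filter_keys format (the test passes sets where the other examples pass lists). In the examples that follow, raw_query returns a dict whose 'data' key holds the result rows, with 'meta', 'totals', or 'error' appearing in some cases. A minimal, self-contained sketch of that shape; the stub raw_query below is hypothetical and only stands in for sentry.utils.snuba.raw_query:

from datetime import datetime, timedelta

def raw_query(**kwargs):
    # Hypothetical stub mimicking the result shape that the real
    # sentry.utils.snuba.raw_query returns in the examples below.
    return {'data': [{'count': 3}], 'meta': [{'name': 'count'}]}

result = raw_query(
    start=datetime.utcnow() - timedelta(days=1),
    end=datetime.utcnow(),
    filter_keys={'project_id': [1], 'logger': ['asdf']},
    aggregations=[['count()', '', 'count']],
)
count = result['data'][0]['count'] if 'error' not in result else None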
Example #2
    def __search_events_snuba(self, request, project):
        from functools32 import partial
        from sentry.api.paginator import GenericOffsetPaginator
        from sentry.api.serializers.models.event import SnubaEvent
        from sentry.utils.snuba import raw_query

        query = request.GET.get('query')
        conditions = []
        if query:
            conditions.append(
                [['positionCaseInsensitive', ['message', "'%s'" % (query,)]], '!=', 0])

        now = timezone.now()
        data_fn = partial(
            # extract 'data' from raw_query result
            lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
            start=now - timedelta(days=90),
            end=now,
            conditions=conditions,
            filter_keys={'project_id': [project.id]},
            selected_columns=SnubaEvent.selected_columns,
            orderby='-timestamp',
            referrer='api.project-events',
        )

        return self.paginate(
            request=request,
            on_results=lambda results: serialize(
                [SnubaEvent(row) for row in results], request.user),
            paginator=GenericOffsetPaginator(data_fn=data_fn)
        )
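The pattern here: partial pre-binds all of the query arguments, and the paginator is presumably left to supply only the paging arguments (raw_query accepts offset and limit, as later examples show). A rough, self-contained sketch of the same idea with a stubbed raw_query; GenericOffsetPaginator's exact calling convention is an assumption:

from functools import partial  # functools32 on Python 2, as in the example above

def raw_query(*args, **kwargs):
    # Hypothetical stub for sentry.utils.snuba.raw_query.
    return {'data': [{'event_id': 'ab' * 16, 'timestamp': '2019-01-01T00:00:00'}]}

data_fn = partial(
    # extract 'data' from the raw_query result, as in the example above
    lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
    conditions=[],
    filter_keys={'project_id': [1]},
    orderby='-timestamp',
)

# The offset paginator is assumed to page through results roughly like this:
first_page = data_fn(offset=0, limit=100)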
Example #3
    def get(self, request, organization):
        try:
            snuba_args = self.get_snuba_query_args(request, organization)
        except OrganizationEventsError as exc:
            return Response({'detail': exc.message}, status=400)
        except NoProjects:
            return Response({'data': []})

        interval = parse_stats_period(request.GET.get('interval', '1h'))
        if interval is None:
            interval = timedelta(hours=1)

        rollup = int(interval.total_seconds())

        result = raw_query(
            aggregations=[
                ('count()', '', 'count'),
            ],
            orderby='time',
            groupby=['time'],
            rollup=rollup,
            referrer='api.organization-events-stats',
            limit=10000,
            **snuba_args
        )

        serializer = SnubaTSResultSerializer(organization, None, request.user)
        return Response(
            serializer.serialize(
                SnubaTSResult(result, snuba_args['start'], snuba_args['end'], rollup),
            ),
            status=200,
        )
Example #4
    def prev_event_id(self, environments=None):
        from sentry.utils import snuba

        conditions = [
            ['timestamp', '<=', self.timestamp],
            [['timestamp', '<', self.timestamp], ['event_id', '<', self.event_id]]
        ]

        if environments:
            conditions.append(['environment', 'IN', environments])

        result = snuba.raw_query(
            start=datetime.utcfromtimestamp(0),  # will be clamped to project retention
            end=self.datetime,  # lte current event
            selected_columns=['event_id'],
            conditions=conditions,
            filter_keys={
                'project_id': [self.project_id],
                'issue': [self.group_id],
            },
            orderby=['-timestamp', '-event_id'],
            limit=1,
            referrer='SnubaEvent.prev_event_id',
        )

        if 'error' in result or len(result['data']) == 0:
            return None

        return six.text_type(result['data'][0]['event_id'])
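In this condition format, a nested list of conditions appears to act as an OR group: combined with the top-level AND, the filter reads as "timestamp strictly earlier, or equal timestamp with a smaller event_id", and the descending orderby plus limit=1 picks the immediately preceding event. A hypothetical next_event_id would presumably just flip the comparisons and the ordering; a sketch of only the cursor pieces, with placeholder values:

timestamp = '2019-01-01T00:00:00'   # placeholder cursor values
event_id = 'a' * 32

conditions = [
    ['timestamp', '>=', timestamp],
    # nested list: assumed to be OR'd together, mirroring prev_event_id above
    [['timestamp', '>', timestamp], ['event_id', '>', event_id]],
]
orderby = ['timestamp', 'event_id']  # ascending: the next event comes first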
Example #5
    def get(self, request, organization):
        try:
            snuba_args = self.get_snuba_query_args(request, organization)
        except OrganizationEventsError as exc:
            return Response({'detail': exc.message}, status=400)
        except NoProjects:
            # return empty result if org doesn't have projects
            # or user doesn't have access to projects in org
            data_fn = lambda *args, **kwargs: []
        else:
            data_fn = partial(
                # extract 'data' from raw_query result
                lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
                selected_columns=SnubaEvent.selected_columns,
                orderby='-timestamp',
                referrer='api.organization-events',
                **snuba_args
            )

        return self.paginate(
            request=request,
            on_results=lambda results: serialize(
                [SnubaEvent(row) for row in results], request.user),
            paginator=GenericOffsetPaginator(data_fn=data_fn)
        )
Example #6
    def test_use_group_id(self):
        base_time = datetime.utcnow()
        group = self.create_group()
        self._insert_event_for_time(base_time, group_id=group.id)

        with self.options({'snuba.use_group_id_column': True}):
            # verify filter_keys and aggregation
            assert snuba.query(
                start=base_time - timedelta(days=1),
                end=base_time + timedelta(days=1),
                groupby=['issue'],
                filter_keys={
                    'project_id': [self.project.id],
                    'issue': [group.id]
                },
            ) == {group.id: 1}

            # verify raw_query selecting issue row
            assert snuba.raw_query(
                start=base_time - timedelta(days=1),
                end=base_time + timedelta(days=1),
                selected_columns=['issue', 'timestamp'],
                filter_keys={
                    'project_id': [self.project.id],
                    'issue': [group.id]
                },
            )['data'] == [{
                'issue': group.id,
                'timestamp': base_time.strftime('%Y-%m-%dT%H:%M:%S+00:00'),
            }]
Example #7
def get_oldest_or_latest_event_for_environments(
        ordering, environments=(), issue_id=None, project_id=None):
    from sentry.utils import snuba
    from sentry.models import SnubaEvent

    conditions = []

    if len(environments) > 0:
        conditions.append(['environment', 'IN', environments])

    result = snuba.raw_query(
        start=datetime.utcfromtimestamp(0),
        end=datetime.utcnow(),
        selected_columns=SnubaEvent.selected_columns,
        conditions=conditions,
        filter_keys={
            'issue': [issue_id],
            'project_id': [project_id],
        },
        orderby=ordering.value,
        limit=1,
        referrer="Group.get_latest",
    )

    if 'error' not in result and len(result['data']) == 1:
        return SnubaEvent(result['data'][0])

    return None
Example #8
    def get(self, request, organization):
        query = request.GET.get('query')
        conditions = []
        if query:
            conditions.append(
                [['positionCaseInsensitive', ['message', "'%s'" % (query,)]], '!=', 0])

        try:
            start, end = get_date_range_from_params(request.GET)
        except InvalidParams as exc:
            return Response({'detail': exc.message}, status=400)

        try:
            project_ids = self.get_project_ids(request, organization)
        except ValueError:
            return Response({'detail': 'Invalid project ids'}, status=400)

        data_fn = partial(
            # extract 'data' from raw_query result
            lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
            start=start,
            end=end,
            conditions=conditions,
            filter_keys={'project_id': project_ids},
            selected_columns=SnubaEvent.selected_columns,
            orderby='-timestamp',
        )

        return self.paginate(
            request=request,
            on_results=lambda results: serialize(
                [SnubaEvent(row) for row in results], request.user),
            paginator=GenericOffsetPaginator(data_fn=data_fn)
        )
Example #9
    def get_group_event_filter(self, project_id, group_id, environment_ids, tags, start, end):
        default_start, default_end = self.get_time_range()
        start = max(start, default_start) if start else default_start
        end = min(end, default_end) if end else default_end

        filters = {
            'project_id': [project_id],
            'issue': [group_id],
        }
        if environment_ids:
            filters['environment'] = environment_ids

        conditions = []
        for tag_name, tag_val in tags.items():
            operator = 'IN' if isinstance(tag_val, list) else '='
            conditions.append([u'tags[{}]'.format(tag_name), operator, tag_val])

        result = snuba.raw_query(start, end, selected_columns=['event_id'],
                                 conditions=conditions, orderby='-timestamp', filter_keys=filters,
                                 limit=1000, referrer='tagstore.get_group_event_filter')

        event_id_set = set(row['event_id'] for row in result['data'])

        if not event_id_set:
            return None

        return {'event_id__in': event_id_set}
Example #10
    def test_shrink_timeframe(self):
        now = datetime.now()
        year_ago = now - timedelta(days=365)
        year_ahead = now + timedelta(days=365)

        issues = None
        assert snuba.shrink_time_window(issues, year_ago, year_ahead) == (year_ago, year_ahead)

        issues = []
        assert snuba.shrink_time_window(issues, year_ago, year_ahead) == (year_ago, year_ahead)

        group1 = self.create_group()
        group1.first_seen = now - timedelta(hours=1)
        group1.last_seen = now
        group1.save()
        GroupHash.objects.create(project_id=group1.project_id, group=group1, hash='a' * 32)

        group2 = self.create_group()
        GroupHash.objects.create(project_id=group2.project_id, group=group2, hash='b' * 32)

        # issues is a list like [(gid, pid, [(hash, tombstone_date), ...]), ...]
        issues = [(group1.id, group1.project_id, [('a' * 32, None)])]
        assert snuba.shrink_time_window(issues, year_ago, year_ahead) == \
            (now - timedelta(hours=1, minutes=5), now + timedelta(minutes=5))

        issues = [
            (group1.id, group1.project_id, [('a' * 32, None)]),
            (group2.id, group2.project_id, [('b' * 32, None)]),
        ]
        assert snuba.shrink_time_window(issues, year_ago, year_ahead) == (year_ago, year_ahead)

        with pytest.raises(snuba.QueryOutsideGroupActivityError):
            # query a group for a time range before it had any activity
            snuba.raw_query(
                start=group1.first_seen - timedelta(days=1, hours=1),
                end=group1.first_seen - timedelta(days=1),
                filter_keys={
                    'project_id': [group1.project_id],
                    'issue': [group1.id],
                },
                aggregations=[
                    ['count()', '', 'count'],
                ],
            )
Example #11
    def do_query(self, projects, **kwargs):
        requested_query = deepcopy(kwargs)

        selected_columns = kwargs['selected_columns']
        groupby_columns = kwargs['groupby']

        if 'project_name' in requested_query['selected_columns']:
            selected_columns.remove('project_name')
            if 'project_id' not in selected_columns:
                selected_columns.append('project_id')

        if 'project_name' in requested_query['groupby']:
            groupby_columns.remove('project_name')
            if 'project_id' not in groupby_columns:
                groupby_columns.append('project_id')

        for aggregation in kwargs['aggregations']:
            if aggregation[1] == 'project_name':
                aggregation[1] = 'project_id'

        snuba_results = snuba.raw_query(
            referrer='discover',
            **kwargs
        )

        if 'project_name' in requested_query['selected_columns']:
            project_name_index = requested_query['selected_columns'].index('project_name')
            snuba_results['meta'].insert(project_name_index, {'name': 'project_name'})
            if 'project_id' not in requested_query['selected_columns']:
                snuba_results['meta'] = [
                    field for field in snuba_results['meta'] if field['name'] != 'project_id'
                ]

            for result in snuba_results['data']:
                result['project_name'] = projects[result['project_id']]
                if 'project_id' not in requested_query['selected_columns']:
                    del result['project_id']

        if 'project_name' in requested_query['groupby']:
            project_name_index = requested_query['groupby'].index('project_name')
            snuba_results['meta'].insert(project_name_index, {'name': 'project_name'})
            if 'project_id' not in requested_query['groupby']:
                snuba_results['meta'] = [
                    field for field in snuba_results['meta'] if field['name'] != 'project_id'
                ]

            for result in snuba_results['data']:
                result['project_name'] = projects[result['project_id']]
                if 'project_id' not in requested_query['groupby']:
                    del result['project_id']

        # Only return the meta property "name"
        snuba_results['meta'] = [{'name': field['name']} for field in snuba_results['meta']]

        return snuba_results
Example #12
    def do_query(self, start, end, groupby, **kwargs):

        snuba_results = snuba.raw_query(
            start=start,
            end=end,
            groupby=groupby,
            referrer='discover',
            **kwargs
        )

        return snuba_results
Example #13
def get_incident_aggregates(incident):
    kwargs = build_incident_query_params(incident)
    return raw_query(
        aggregations=[
            ('count()', '', 'count'),
            ('uniq', 'tags[sentry:user]', 'unique_users'),
        ],
        referrer='incidents.get_incident_aggregates',
        limit=10000,
        **kwargs
    )['data'][0]
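build_incident_query_params is not shown here, but Example #18 reads kwargs['start'] and kwargs['end'] from it, so it presumably returns a dict of raw_query keyword arguments that is expanded with **; the single aggregate row then comes back as ['data'][0]. A self-contained sketch with a hypothetical kwargs dict and a stubbed raw_query:

from datetime import datetime, timedelta

def raw_query(**params):
    # Hypothetical stub; one aggregate row comes back under 'data'.
    return {'data': [{'count': 10, 'unique_users': 4}]}

# Assumed shape of what build_incident_query_params returns.
kwargs = {
    'start': datetime.utcnow() - timedelta(hours=1),
    'end': datetime.utcnow(),
    'conditions': [],
    'filter_keys': {'project_id': [1]},
}

aggregates = raw_query(
    aggregations=[
        ('count()', '', 'count'),
        ('uniq', 'tags[sentry:user]', 'unique_users'),
    ],
    limit=10000,
    **kwargs
)['data'][0]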
Example #14
    def test_shrink_timeframe(self):
        now = datetime.now()
        year_ago = now - timedelta(days=365)

        issues = None
        assert snuba.shrink_time_window(issues, year_ago) == year_ago

        issues = []
        assert snuba.shrink_time_window(issues, year_ago) == year_ago

        group1 = self.create_group()
        group1.first_seen = now - timedelta(hours=1)
        group1.last_seen = now
        group1.save()
        GroupHash.objects.create(project_id=group1.project_id, group=group1, hash='a' * 32)

        group2 = self.create_group()
        GroupHash.objects.create(project_id=group2.project_id, group=group2, hash='b' * 32)

        issues = [group1.id]
        assert snuba.shrink_time_window(issues, year_ago) == \
            now - timedelta(hours=1, minutes=5)

        issues = [group1.id, group2.id]
        assert snuba.shrink_time_window(issues, year_ago) == year_ago

        with pytest.raises(snuba.QueryOutsideGroupActivityError):
            # query a group for a time range before it had any activity
            snuba.raw_query(
                start=group1.first_seen - timedelta(days=1, hours=1),
                end=group1.first_seen - timedelta(days=1),
                filter_keys={
                    'project_id': [group1.project_id],
                    'issue': [group1.id],
                },
                aggregations=[
                    ['count()', '', 'count'],
                ],
            )
Example #15
 def get_event(cls, project_id, event_id, snuba_cols=selected_columns):
     from sentry.utils import snuba
     result = snuba.raw_query(
         start=datetime.utcfromtimestamp(0),  # will be clamped to project retention
         end=datetime.utcnow(),  # will be clamped to project retention
         selected_columns=snuba_cols,
         filter_keys={
             'event_id': [event_id],
             'project_id': [project_id],
         },
         referrer='SnubaEvent.get_event',
     )
     if 'error' not in result and len(result['data']) == 1:
         return SnubaEvent(result['data'][0])
     return None
Example #16
    def _get_events_snuba(self, request, group, environments, query, tags, start, end):
        default_end = timezone.now()
        default_start = default_end - timedelta(days=90)
        params = {
            'issue.id': [group.id],
            'project_id': [group.project_id],
            'start': start if start else default_start,
            'end': end if end else default_end
        }
        direct_hit_resp = get_direct_hit_response(request, query, params, 'api.group-events')
        if direct_hit_resp:
            return direct_hit_resp

        if environments:
            params['environment'] = [env.name for env in environments]

        full = request.GET.get('full', False)
        snuba_args = get_snuba_query_args(request.GET.get('query', None), params)

        # TODO(lb): remove once boolean search is fully functional
        if snuba_args:
            has_boolean_op_flag = features.has(
                'organizations:boolean-search',
                group.project.organization,
                actor=request.user
            )
            if snuba_args.pop('has_boolean_terms', False) and not has_boolean_op_flag:
                raise GroupEventsError(
                    'Boolean search operator OR and AND not allowed in this search.')

        snuba_cols = SnubaEvent.minimal_columns if full else SnubaEvent.selected_columns

        data_fn = partial(
            # extract 'data' from raw_query result
            lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
            selected_columns=snuba_cols,
            orderby='-timestamp',
            referrer='api.group-events',
            **snuba_args
        )

        serializer = EventSerializer() if full else SimpleEventSerializer()
        return self.paginate(
            request=request,
            on_results=lambda results: serialize(
                [SnubaEvent(row) for row in results], request.user, serializer),
            paginator=GenericOffsetPaginator(data_fn=data_fn)
        )
Example #17
    def get(self, request, organization):
        # Check for a direct hit on event ID
        query = request.GET.get('query', '').strip()

        try:
            direct_hit_resp = get_direct_hit_response(
                request,
                query,
                self.get_filter_params(request, organization),
                'api.organization-events'
            )
        except (OrganizationEventsError, NoProjects):
            pass
        else:
            if direct_hit_resp:
                return direct_hit_resp

        full = request.GET.get('full', False)
        try:
            snuba_args = self.get_snuba_query_args(request, organization)
        except OrganizationEventsError as exc:
            return Response({'detail': exc.message}, status=400)
        except NoProjects:
            # return empty result if org doesn't have projects
            # or user doesn't have access to projects in org
            data_fn = lambda *args, **kwargs: []
        else:
            snuba_cols = SnubaEvent.minimal_columns if full else SnubaEvent.selected_columns
            data_fn = partial(
                # extract 'data' from raw_query result
                lambda *args, **kwargs: raw_query(*args, **kwargs)['data'],
                selected_columns=snuba_cols,
                orderby='-timestamp',
                referrer='api.organization-events',
                **snuba_args
            )

        serializer = EventSerializer() if full else SimpleEventSerializer()
        return self.paginate(
            request=request,
            on_results=lambda results: serialize(
                [SnubaEvent(row) for row in results], request.user, serializer),
            paginator=GenericOffsetPaginator(data_fn=data_fn)
        )
Example #18
def get_incident_event_stats(incident, data_points=20):
    kwargs = build_incident_query_params(incident)
    rollup = max(int(incident.duration.total_seconds() / data_points), 1)
    return SnubaTSResult(
        raw_query(
            aggregations=[
                ('count()', '', 'count'),
            ],
            orderby='time',
            groupby=['time'],
            rollup=rollup,
            referrer='incidents.get_incident_event_stats',
            limit=10000,
            **kwargs
        ),
        kwargs['start'],
        kwargs['end'],
        rollup,
    )
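A worked example of the rollup arithmetic above: a two-hour incident split into the default 20 data points yields 360-second buckets, and max(..., 1) guards against a rollup of zero for very short incidents.

from datetime import timedelta

duration = timedelta(hours=2)
data_points = 20
rollup = max(int(duration.total_seconds() / data_points), 1)
assert rollup == 360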
Example #19
    def get_group_event_filter(self, project_id, group_id, environment_id, tags):
        start, end = self.get_time_range()
        filters = {
            'project_id': [project_id],
            'environment': [environment_id],
            'issue': [group_id],
        }

        conditions = [[u'tags[{}]'.format(k), '=', v] for (k, v) in tags.items()]

        result = snuba.raw_query(start, end, selected_columns=['event_id'],
                                 conditions=conditions, orderby='-timestamp', filter_keys=filters,
                                 limit=1000, referrer='tagstore.get_group_event_filter')

        event_id_set = set(row['event_id'] for row in result['data'])

        if not event_id_set:
            return None

        return {'event_id__in': event_id_set}
Example #20
    def get(self, request, organization):
        try:
            snuba_args = self.get_snuba_query_args(request, organization)
        except OrganizationEventsError as exc:
            return Response({'detail': exc.message}, status=400)
        except NoProjects:
            return Response({'count': 0})

        data = raw_query(
            aggregations=[['count()', '', 'count']],
            referrer='api.organization-event-meta',
            turbo=True,
            **snuba_args
        )['data'][0]

        return Response({
            # this needs to be multiplied to account for the `TURBO_SAMPLE_RATE`
            # in snuba
            'count': data['count'] * 10,
        })
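A sketch of the rescaling arithmetic behind the hard-coded * 10: with turbo=True, Snuba samples the data, and an assumed TURBO_SAMPLE_RATE of 0.1 (Example #31's comment also treats a 0.1 sample rate as equivalent to turbo) means the sampled count is scaled back up by 1 / 0.1:

TURBO_SAMPLE_RATE = 0.1  # assumption inferred from the hard-coded * 10 above

sampled_count = 1234          # what raw_query(turbo=True) would report
estimated_count = int(sampled_count * (1 / TURBO_SAMPLE_RATE))  # 12340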
Example #21
def get_direct_hit_response(request, query, snuba_params, referrer):
    """
    Checks whether a query is a direct hit for an event, and if so returns
    a response. Otherwise returns None.
    """
    if is_event_id(query):
        snuba_args = get_snuba_query_args(
            query=u'id:{}'.format(query),
            params=snuba_params)

        results = raw_query(
            selected_columns=SnubaEvent.selected_columns,
            referrer=referrer,
            **snuba_args
        )['data']

        if len(results) == 1:
            response = Response(
                serialize([SnubaEvent(row) for row in results], request.user)
            )
            response['X-Sentry-Direct-Hit'] = '1'
            return response
Example #22
def calculate_incident_start(query, projects, groups):
    """
    Attempts to automatically calculate the date that an incident began at based
    on the events related to the incident.
    """
    params = {}
    if groups:
        params["issue.id"] = [g.id for g in groups]
        end = max(g.last_seen for g in groups) + timedelta(seconds=1)
    else:
        end = timezone.now()

    params["start"] = end - INCIDENT_START_PERIOD
    params["end"] = end

    if projects:
        params["project_id"] = [p.id for p in projects]

    query_args = get_snuba_query_args(query, params)
    rollup = int(INCIDENT_START_ROLLUP.total_seconds())

    result = raw_query(aggregations=[("count()", "", "count"),
                                     ("min", "timestamp", "first_seen")],
                       orderby="time",
                       groupby=["time"],
                       rollup=rollup,
                       referrer="incidents.calculate_incident_start",
                       limit=10000,
                       **query_args)["data"]
    # TODO: Start could be the period before the first period we find
    result = zerofill(result, params["start"], params["end"], rollup, "time")

    # We want to linearly scale scores from 100% value at the most recent to
    # 50% at the oldest. This gives a bias towards newer results.
    negative_weight = (1.0 / len(result)) / 2
    multiplier = 1.0
    cur_spike_max_count = -1
    cur_spike_start = None
    cur_spike_end = None
    max_height = 0
    incident_start = None
    cur_height = 0
    prev_count = 0

    def get_row_first_seen(row, default=None):
        first_seen = default
        if "first_seen" in row:
            first_seen = parse_date(row["first_seen"]).replace(tzinfo=pytz.utc)
        return first_seen

    def calculate_start(spike_start, spike_end):
        """
        We arbitrarily choose a date about 1/3 into the incident period. We
        could potentially improve this if we want by analyzing the period in
        more detail and choosing a date that most closely fits with being 1/3
        up the spike.
        """
        spike_length = spike_end - spike_start
        return spike_start + (spike_length / 3)

    for row in reversed(result):
        cur_count = row.get("count", 0)
        if cur_count < prev_count or (cur_count > 0 and cur_count == prev_count):
            cur_height = cur_spike_max_count - cur_count
        elif cur_count > 0 or prev_count > 0 or cur_height > 0:
            # Now we've got the height of the current spike, compare it to the
            # current max. We decrease the value by `multiplier` so that we
            # favour newer results
            cur_height *= multiplier
            if cur_height > max_height:
                # If we detect that we have a new highest peak, then set a new
                # incident start date
                incident_start = calculate_start(cur_spike_start,
                                                 cur_spike_end)
                max_height = cur_height

            cur_height = 0
            cur_spike_max_count = cur_count
            cur_spike_end = get_row_first_seen(row)

        # We attempt to get the first_seen value from the row here. If the row
        # doesn't have it (because it's a zerofilled row), then just use the
        # previous value. This allows us to have the start of a spike always be
        # a bucket that contains at least one element.
        cur_spike_start = get_row_first_seen(row, cur_spike_start)
        prev_count = cur_count
        multiplier -= negative_weight

    if (cur_height > max_height or not incident_start) and cur_spike_start:
        incident_start = calculate_start(cur_spike_start, cur_spike_end)

    if not incident_start:
        incident_start = timezone.now()

    return incident_start
Example #23
def get_release_health_data_overview(
    project_releases,
    environments=None,
    summary_stats_period=None,
    health_stats_period=None,
    stat=None,
):
    """Checks quickly for which of the given project releases we have
    health data available.  The argument is a tuple of `(project_id, release_name)`
    tuples.  The return value is a set of all the project releases that have health
    data.
    """
    if stat is None:
        stat = "sessions"
    assert stat in ("sessions", "users")

    _, summary_start, _ = get_rollup_starts_and_buckets(summary_stats_period
                                                        or "24h")
    conditions, filter_keys = _get_conditions_and_filter_keys(
        project_releases, environments)

    stats_rollup, stats_start, stats_buckets = get_rollup_starts_and_buckets(
        health_stats_period)

    missing_releases = set(project_releases)
    rv = {}
    for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=[
                "release",
                "project_id",
                "duration_quantiles",
                "users",
                "sessions",
                "sessions_errored",
                "sessions_crashed",
                "users_crashed",
            ],
            groupby=["release", "project_id"],
            start=summary_start,
            conditions=conditions,
            filter_keys=filter_keys,
    )["data"]:
        rp = {
            "duration_p50": _convert_duration(x["duration_quantiles"][0]),
            "duration_p90": _convert_duration(x["duration_quantiles"][1]),
            "crash_free_users": (
                100 - x["users_crashed"] / float(x["users"]) * 100
                if x["users"] else None),
            "crash_free_sessions": (
                100 - x["sessions_crashed"] / float(x["sessions"]) * 100
                if x["sessions"] else None),
            "total_users": x["users"],
            "total_sessions": x["sessions"],
            "sessions_crashed": x["sessions_crashed"],
            "sessions_errored": x["sessions_errored"],
            "has_health_data": True,
        }
        if health_stats_period:
            rp["stats"] = {
                health_stats_period:
                _make_stats(stats_start, stats_rollup, stats_buckets)
            }
        rv[x["project_id"], x["release"]] = rp
        missing_releases.discard((x["project_id"], x["release"]))

    # Add releases without data points
    if missing_releases:
        # If we're already looking at a 90 day horizon we don't need to
        # fire another query; we can already assume there is no data.
        if summary_stats_period != "90d":
            has_health_data = check_has_health_data(missing_releases)
        else:
            has_health_data = ()
        for key in missing_releases:
            rv[key] = {
                "duration_p50": None,
                "duration_p90": None,
                "crash_free_users": None,
                "crash_free_sessions": None,
                "total_users": 0,
                "total_sessions": 0,
                "sessions_crashed": 0,
                "sessions_errored": 0,
                "has_health_data": key in has_health_data,
            }
            if health_stats_period:
                rv[key]["stats"] = {
                    health_stats_period:
                    _make_stats(stats_start, stats_rollup, stats_buckets)
                }

    # Fill in release adoption
    release_adoption = get_release_adoption(project_releases, environments)
    for key in rv:
        adoption_info = release_adoption.get(key) or {}
        rv[key]["adoption"] = adoption_info.get("adoption")
        rv[key]["total_users_24h"] = adoption_info.get("users_24h")
        rv[key]["total_sessions_24h"] = adoption_info.get("sessions_24h")

    if health_stats_period:
        for x in raw_query(
                dataset=Dataset.Sessions,
                selected_columns=[
                    "release", "project_id", "bucketed_started", stat
                ],
                groupby=["release", "project_id", "bucketed_started"],
                rollup=stats_rollup,
                start=stats_start,
                conditions=conditions,
                filter_keys=filter_keys,
        )["data"]:
            time_bucket = int((parse_snuba_datetime(x["bucketed_started"]) -
                               stats_start).total_seconds() / stats_rollup)
            rv[x["project_id"], x["release"]]["stats"][health_stats_period][
                time_bucket][1] = x[stat]

    return rv
Example #24
def query(**kwargs):
    kwargs['referrer'] = 'health'
    kwargs['totals'] = True
    return snuba.raw_query(**kwargs)
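This thin wrapper only pins referrer='health' and totals=True; a call would pass the same keyword arguments raw_query takes in the other examples. A hypothetical invocation of the query function defined above (it assumes the surrounding Sentry environment):

from datetime import datetime, timedelta

stats = query(
    start=datetime.utcnow() - timedelta(days=1),
    end=datetime.utcnow(),
    aggregations=[['count()', '', 'count']],
    filter_keys={'project_id': [1]},
    groupby=['project_id'],
)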
Example #25
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
    conditions=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # TODO(evanh): These can be removed once we migrate the frontend / saved queries
    # to use the new function values
    selected_columns, function_translations = transform_deprecated_functions_in_columns(
        selected_columns)
    query = transform_deprecated_functions_in_query(query)

    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim once all of
    # discover is using this module. Remember to update all the functions
    # in this module.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "orderby": orderby,
        "having": [],
    }

    if use_aggregate_conditions:
        snuba_args["having"] = snuba_filter.having

    # We need to run a separate query to be able to properly bucket the values for the histogram
    # Do that here, and format the bucket number in to the columns before passing it through
    # to event search.
    idx = 0
    for col in selected_columns:
        if col.startswith("histogram("):
            histogram_column = find_histogram_buckets(col, params,
                                                      snuba_filter.conditions)
            selected_columns[idx] = histogram_column
            function_translations[get_function_alias(
                histogram_column)] = get_function_alias(col)
            break

        idx += 1

    # Check to see if we are ordering by any functions and convert the orderby to be the correct alias.
    if orderby:
        orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
        new_orderby = []
        for ordering in orderby:
            is_reversed = ordering.startswith("-")
            ordering = ordering.lstrip("-")
            for snuba_name, sentry_name in six.iteritems(
                    function_translations):
                if sentry_name == ordering:
                    ordering = snuba_name
                    break

            ordering = "{}{}".format("-" if is_reversed else "", ordering)
            new_orderby.append(ordering)

        snuba_args["orderby"] = new_orderby

    snuba_args.update(
        resolve_field_list(selected_columns,
                           snuba_args,
                           params=params,
                           auto_fields=auto_fields))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(
        snuba_args, function_translations)

    # Make sure that any aggregate conditions are also in the selected columns
    for having_clause in snuba_args.get("having"):
        found = any(having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_args.get("aggregations"))
        if not found:
            raise InvalidSearchQuery(
                u"Aggregate {} used in a condition but is not a selected column."
                .format(having_clause[0]))

    if conditions is not None:
        snuba_args["conditions"].extend(conditions)

    result = raw_query(
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        groupby=snuba_args.get("groupby"),
        conditions=snuba_args.get("conditions"),
        aggregations=snuba_args.get("aggregations"),
        selected_columns=snuba_args.get("selected_columns"),
        filter_keys=snuba_args.get("filter_keys"),
        having=snuba_args.get("having"),
        orderby=snuba_args.get("orderby"),
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_args)
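A hypothetical call to this high-level query() API, following the docstring above; the column names and query syntax are assumptions for illustration, not taken from the surrounding code, and the call assumes the discover module's environment:

from datetime import datetime, timedelta

results = query(
    selected_columns=['title', 'count()'],
    query='event.type:error',
    params={
        'start': datetime.utcnow() - timedelta(days=14),
        'end': datetime.utcnow(),
        'project_id': [1],
    },
    orderby='-count',
    limit=50,
    referrer='api.discover.example',
)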
Example #26
def histogram_query(
    fields,
    user_query,
    params,
    num_buckets,
    precision=0,
    min_value=None,
    max_value=None,
    data_filter=None,
    referrer=None,
    group_by=None,
    order_by=None,
    limit_by=None,
    extra_conditions=None,
    normalize_results=True,
):
    """
    API for generating histograms for numeric columns.

    A multihistogram is possible only if the columns are all array columns.
    Array columns are columns whose values are nested arrays.
    Measurements and span op breakdowns are examples of array columns.
    The resulting histograms will have their bins aligned.

    :param [str] fields: The list of fields for which you want to generate histograms for.
    :param str user_query: Filter query string to create conditions from.
    :param {str: str} params: Filtering parameters with start, end, project_id, environment
    :param int num_buckets: The number of buckets the histogram should contain.
    :param int precision: The number of decimal places to preserve, default 0.
    :param float min_value: The minimum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param float max_value: The maximum value allowed to be in the histogram.
        If left unspecified, it is queried using `user_query` and `params`.
    :param str data_filter: Indicate the filter strategy to be applied to the data.
    :param [str] group_by: Experimental. Allows additional grouping to serve multifacet histograms.
    :param [str] order_by: Experimental. Allows additional ordering within each alias to serve multifacet histograms.
    :param [str] limit_by: Experimental. Allows limiting within a group when serving multifacet histograms.
    :param [str] extra_conditions: Adds any additional conditions to the histogram query that aren't received from params.
    :param bool normalize_results: Indicate whether to normalize the results by column into bins.
    """

    multiplier = int(10**precision)
    if max_value is not None:
        # We want the specified max_value to be exclusive, and the queried max_value
        # to be inclusive. So we adjust the specified max_value using the multiplier.
        max_value -= 0.1 / multiplier
    min_value, max_value = find_histogram_min_max(fields, min_value, max_value,
                                                  user_query, params,
                                                  data_filter)

    key_column = None
    array_column = None
    histogram_function = None
    conditions = []
    if len(fields) > 1:
        array_column = check_multihistogram_fields(fields)
        if array_column == "measurements":
            key_column = "array_join(measurements_key)"
            histogram_function = get_measurement_name
        elif array_column == "span_op_breakdowns":
            key_column = "array_join(span_op_breakdowns_key)"
            histogram_function = get_span_op_breakdown_name
        else:
            raise InvalidSearchQuery(
                "multihistogram expected either all measurements or all breakdowns"
            )

        key_alias = get_function_alias(key_column)
        field_names = [histogram_function(field) for field in fields]
        conditions.append([key_alias, "IN", field_names])

    if extra_conditions:
        conditions.extend(extra_conditions)

    histogram_params = find_histogram_params(num_buckets, min_value, max_value,
                                             multiplier)
    histogram_column = get_histogram_column(fields, key_column,
                                            histogram_params, array_column)
    histogram_alias = get_function_alias(histogram_column)

    if min_value is None or max_value is None:
        return normalize_histogram_results(fields, key_column,
                                           histogram_params, {"data": []},
                                           array_column)
    # make sure to bound the bins to get the desired range of results
    if min_value is not None:
        min_bin = histogram_params.start_offset
        conditions.append([histogram_alias, ">=", min_bin])
    if max_value is not None:
        max_bin = histogram_params.start_offset + histogram_params.bucket_size * num_buckets
        conditions.append([histogram_alias, "<=", max_bin])

    columns = [] if key_column is None else [key_column]
    limit = len(fields) * num_buckets

    histogram_query = prepare_discover_query(
        selected_columns=columns + [histogram_column, "count()"],
        conditions=conditions,
        query=user_query,
        params=params,
        orderby=(order_by if order_by else []) + [histogram_alias],
        functions_acl=["array_join", "histogram"],
    )

    snuba_filter = histogram_query.filter

    if group_by:
        snuba_filter.groupby += group_by

    result = raw_query(
        start=snuba_filter.start,
        end=snuba_filter.end,
        groupby=snuba_filter.groupby,
        conditions=snuba_filter.conditions,
        aggregations=snuba_filter.aggregations,
        selected_columns=snuba_filter.selected_columns,
        filter_keys=snuba_filter.filter_keys,
        having=snuba_filter.having,
        orderby=snuba_filter.orderby,
        dataset=Dataset.Discover,
        limitby=limit_by,
        limit=limit,
        referrer=referrer,
    )

    results = transform_results(
        result,
        histogram_query.fields["functions"],
        histogram_query.columns,
        snuba_filter,
    )

    if not normalize_results:
        return results

    return normalize_histogram_results(fields, key_column, histogram_params,
                                       results, array_column)
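A hypothetical call to histogram_query, following its docstring; the field name and query string are assumptions for illustration, and the call assumes the surrounding discover environment:

from datetime import datetime, timedelta

histogram = histogram_query(
    fields=['measurements.lcp'],        # assumed measurement field
    user_query='event.type:transaction',
    params={
        'start': datetime.utcnow() - timedelta(days=7),
        'end': datetime.utcnow(),
        'project_id': [1],
    },
    num_buckets=10,
    precision=0,
    referrer='api.discover.histogram-example',
)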
Example #27
def snuba_search(start, end, project_ids, environment_ids, sort_field,
                 cursor=None, candidate_ids=None, limit=None, offset=0,
                 get_sample=False, search_filters=None):
    """
    This function doesn't strictly benefit from or require being pulled out of the main
    query method above, but the query method is already large and this function at least
    extracts most of the Snuba-specific logic.

    Returns a tuple of:
     * a sorted list of (group_id, group_score) tuples sorted descending by score,
     * the count of total results (rows) available for this query.
    """
    filters = {
        'project_id': project_ids,
    }

    if environment_ids is not None:
        filters['environment'] = environment_ids

    if candidate_ids:
        filters['issue'] = candidate_ids

    conditions = []
    having = []
    for search_filter in search_filters:
        if (
            # Don't filter on issue fields here, they're not available
            search_filter.key.name in issue_only_fields or
            # We special case date
            search_filter.key.name == 'date'
        ):
            continue
        converted_filter = convert_search_filter_to_snuba_query(search_filter)

        # Ensure that no user-generated tag that clashes with aggregation_defs is added to having
        if search_filter.key.name in aggregation_defs and not search_filter.key.is_tag:
            having.append(converted_filter)
        else:
            conditions.append(converted_filter)

    extra_aggregations = dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, 'total'] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(aggregation_defs[alias] + [alias])

    if cursor is not None:
        having.append((sort_field, '>=' if cursor.is_prev else '<=', cursor.value))

    selected_columns = []
    if get_sample:
        query_hash = md5(repr(conditions)).hexdigest()[:8]
        selected_columns.append(('cityHash64', ("'{}'".format(query_hash), 'issue'), 'sample'))
        sort_field = 'sample'
        orderby = [sort_field]
        referrer = 'search_sample'
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = ['-{}'.format(sort_field), 'issue']  # ensure stable sort within the same score
        referrer = 'search'

    snuba_results = snuba.raw_query(
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=['issue'],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        totals=True,  # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
    )
    rows = snuba_results['data']
    total = snuba_results['totals']['total']

    if not get_sample:
        metrics.timing('snuba.search.num_result_groups', len(rows))

    return [(row['issue'], row[sort_field]) for row in rows], total
Example #28
def snuba_search(start,
                 end,
                 project_ids,
                 environment_ids,
                 sort_field,
                 cursor=None,
                 candidate_ids=None,
                 limit=None,
                 offset=0,
                 get_sample=False,
                 search_filters=None):
    """
    This function doesn't strictly benefit from or require being pulled out of the main
    query method above, but the query method is already large and this function at least
    extracts most of the Snuba-specific logic.

    Returns a tuple of:
     * a sorted list of (group_id, group_score) tuples sorted descending by score,
     * the count of total results (rows) available for this query.
    """
    filters = {
        'project_id': project_ids,
    }

    if environment_ids is not None:
        filters['environment'] = environment_ids

    if candidate_ids:
        filters['issue'] = candidate_ids

    conditions = []
    having = []
    for search_filter in search_filters:
        if (
                # Don't filter on issue fields here, they're not available
                search_filter.key.name in issue_only_fields or
                # We special case date
                search_filter.key.name == 'date'):
            continue
        converted_filter = convert_search_filter_to_snuba_query(search_filter)

        # Ensure that no user-generated tag that clashes with aggregation_defs is added to having
        if search_filter.key.name in aggregation_defs and not search_filter.key.is_tag:
            having.append(converted_filter)
        else:
            conditions.append(converted_filter)

    extra_aggregations = dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, 'total'] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(aggregation_defs[alias] + [alias])

    if cursor is not None:
        having.append(
            (sort_field, '>=' if cursor.is_prev else '<=', cursor.value))

    selected_columns = []
    if get_sample:
        query_hash = md5(repr(conditions)).hexdigest()[:8]
        selected_columns.append(
            ('cityHash64', ("'{}'".format(query_hash), 'issue'), 'sample'))
        sort_field = 'sample'
        orderby = [sort_field]
        referrer = 'search_sample'
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = ['-{}'.format(sort_field),
                   'issue']  # ensure stable sort within the same score
        referrer = 'search'

    snuba_results = snuba.raw_query(
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=['issue'],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        totals=True,  # Needs to have totals_mode=after_having_exclusive so we get groups matching HAVING only
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
    )
    rows = snuba_results['data']
    total = snuba_results['totals']['total']

    if not get_sample:
        metrics.timing('snuba.search.num_result_groups', len(rows))

    return [(row['issue'], row[sort_field]) for row in rows], total
Example #29
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    reference_event=None,
    referrer=None,
    auto_fields=False,
    use_aggregate_conditions=False,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim once all of
    # discover is using this module. Remember to update all the functions
    # in this module.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "orderby": orderby,
        "having": [],
    }

    if use_aggregate_conditions:
        snuba_args["having"] = snuba_filter.having

    snuba_args.update(
        resolve_field_list(selected_columns,
                           snuba_args,
                           auto_fields=auto_fields))

    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(snuba_args)

    # Make sure that any aggregate conditions are also in the selected columns
    for having_clause in snuba_args.get("having"):
        found = any(having_clause[0] == agg_clause[-1]
                    for agg_clause in snuba_args.get("aggregations"))
        if not found:
            raise InvalidSearchQuery(
                u"Aggregate {} used in a condition but is not a selected column."
                .format(having_clause[0]))

    result = raw_query(
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        groupby=snuba_args.get("groupby"),
        conditions=snuba_args.get("conditions"),
        aggregations=snuba_args.get("aggregations"),
        selected_columns=snuba_args.get("selected_columns"),
        filter_keys=snuba_args.get("filter_keys"),
        having=snuba_args.get("having"),
        orderby=snuba_args.get("orderby"),
        dataset=Dataset.Discover,
        limit=limit,
        offset=offset,
        referrer=referrer,
    )

    return transform_results(result, translated_columns, snuba_args)
Example #30
def timeseries_query(selected_columns,
                     query,
                     params,
                     rollup,
                     reference_event=None,
                     referrer=None):
    """
    High-level API for doing arbitrary user timeseries queries against events.

    This function operates on the public event schema and
    virtual fields/aggregate functions for selected columns and
    conditions are supported through this function.

    This function is intended to only get timeseries based
    results and thus requires the `rollup` parameter.

    Returns a SnubaTSResult object that has been zerofilled in
    case of gaps.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment,
    rollup (int) The bucket width in seconds
    reference_event (ReferenceEvent) A reference event object. Used to generate additional
                    conditions based on the provided reference.
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    snuba_filter = get_filter(query, params)
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
        "having": snuba_filter.having,
    }
    if not snuba_args["start"] and not snuba_args["end"]:
        raise InvalidSearchQuery(
            "Cannot get timeseries result without a start and end.")

    snuba_args.update(
        resolve_field_list(selected_columns, snuba_args, auto_fields=False))
    if reference_event:
        ref_conditions = create_reference_event_conditions(reference_event)
        if ref_conditions:
            snuba_args["conditions"].extend(ref_conditions)

    # Resolve the public aliases into the discover dataset names.
    snuba_args, _ = resolve_discover_aliases(snuba_args)
    if not snuba_args["aggregations"]:
        raise InvalidSearchQuery(
            "Cannot get timeseries result with no aggregation.")

    # Change the alias of the first aggregation to count. This ensures compatibility
    # with other parts of the timeseries endpoint expectations
    if len(snuba_args["aggregations"]) == 1:
        snuba_args["aggregations"][0][2] = "count"

    result = raw_query(
        aggregations=snuba_args.get("aggregations"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        rollup=rollup,
        orderby="time",
        groupby=["time"],
        dataset=Dataset.Discover,
        limit=10000,
        referrer=referrer,
    )
    result = zerofill(result["data"], snuba_args["start"], snuba_args["end"],
                      rollup, "time")

    return SnubaTSResult({"data": result}, snuba_filter.start,
                         snuba_filter.end, rollup)
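A hypothetical call to the timeseries helper above; the 3600-second rollup mirrors the one-hour interval default in Example #3, and the call assumes the surrounding discover environment:

from datetime import datetime, timedelta

ts_result = timeseries_query(
    selected_columns=['count()'],
    query='',
    params={
        'start': datetime.utcnow() - timedelta(days=1),
        'end': datetime.utcnow(),
        'project_id': [1],
    },
    rollup=3600,
    referrer='api.discover.timeseries-example',
)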
Example #31
def get_facets(query, params, limit=10, referrer=None):
    """
    High-level API for getting 'facet map' results.

    Facets are high frequency tags and attribute results that
    can be used to further refine user queries. When many projects
    are requested sampling will be enabled to help keep response times low.

    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.

    Returns Sequence[FacetResult]
    """
    snuba_filter = get_filter(query, params)

    # TODO(mark) Refactor the need for this translation shim.
    snuba_args = {
        "start": snuba_filter.start,
        "end": snuba_filter.end,
        "conditions": snuba_filter.conditions,
        "filter_keys": snuba_filter.filter_keys,
    }
    # Resolve the public aliases into the discover dataset names.
    snuba_args, translated_columns = resolve_discover_aliases(snuba_args)

    # Exclude tracing tags as they are noisy and generally not helpful.
    excluded_tags = [
        "tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span", "project"]
    ]

    # Sample multi-project results, as we don't need full accuracy
    # with that much data.
    sample = len(snuba_filter.filter_keys["project_id"]) > 2

    # Get the most frequent tag keys
    key_names = raw_query(
        aggregations=[["count", None, "count"]],
        start=snuba_args.get("start"),
        end=snuba_args.get("end"),
        conditions=snuba_args.get("conditions"),
        filter_keys=snuba_args.get("filter_keys"),
        orderby=["-count", "tags_key"],
        groupby="tags_key",
        having=[excluded_tags],
        dataset=Dataset.Discover,
        limit=limit,
        referrer=referrer,
        turbo=sample,
    )
    top_tags = [r["tags_key"] for r in key_names["data"]]
    if not top_tags:
        return []

    # TODO(mark) Make the sampling rate scale based on the result size and scaling factor in
    # sentry.options. To test the lowest acceptable sampling rate, we use 0.1 which
    # is equivalent to turbo. We don't use turbo though as we need to re-scale data, and
    # using turbo could cause results to be wrong if the value of turbo is changed in snuba.
    sample_rate = 0.1 if key_names["data"][0]["count"] > 10000 else None
    # Rescale the results if we're sampling
    multiplier = 1 / sample_rate if sample_rate is not None else 1

    fetch_projects = False
    if len(params.get("project_id", [])) > 1:
        if len(top_tags) == limit:
            top_tags.pop()
        fetch_projects = True

    results = []
    if fetch_projects:
        project_values = raw_query(
            aggregations=[["count", None, "count"]],
            start=snuba_args.get("start"),
            end=snuba_args.get("end"),
            conditions=snuba_args.get("conditions"),
            filter_keys=snuba_args.get("filter_keys"),
            groupby="project_id",
            orderby="-count",
            dataset=Dataset.Discover,
            referrer=referrer,
            sample=sample_rate,
            # Ensures Snuba will not apply FINAL
            turbo=sample_rate is not None,
        )
        results.extend([
            FacetResult("project", r["project_id"],
                        int(r["count"]) * multiplier)
            for r in project_values["data"]
        ])

    # Get tag counts for our top tags. Fetching them individually
    # allows snuba to leverage promoted tags better and enables us to get
    # the value count we want.
    max_aggregate_tags = options.get("discover2.max_tags_to_combine")
    individual_tags = []
    aggregate_tags = []
    for i, tag in enumerate(top_tags):
        if tag == "environment":
            # Tags that should always be fetched individually
            individual_tags.append(tag)
        elif i >= len(top_tags) - max_aggregate_tags:
            aggregate_tags.append(tag)
        else:
            individual_tags.append(tag)

    for tag_name in individual_tags:
        tag = u"tags[{}]".format(tag_name)
        tag_values = raw_query(
            aggregations=[["count", None, "count"]],
            conditions=snuba_args.get("conditions"),
            start=snuba_args.get("start"),
            end=snuba_args.get("end"),
            filter_keys=snuba_args.get("filter_keys"),
            orderby=["-count"],
            groupby=[tag],
            limit=TOP_VALUES_DEFAULT_LIMIT,
            dataset=Dataset.Discover,
            referrer=referrer,
            sample=sample_rate,
            # Ensures Snuba will not apply FINAL
            turbo=sample_rate is not None,
        )
        results.extend([
            FacetResult(tag_name, r[tag],
                        int(r["count"]) * multiplier)
            for r in tag_values["data"]
        ])

    if aggregate_tags:
        conditions = snuba_args.get("conditions", [])
        conditions.append(["tags_key", "IN", aggregate_tags])
        tag_values = raw_query(
            aggregations=[["count", None, "count"]],
            conditions=conditions,
            start=snuba_args.get("start"),
            end=snuba_args.get("end"),
            filter_keys=snuba_args.get("filter_keys"),
            orderby=["tags_key", "-count"],
            groupby=["tags_key", "tags_value"],
            dataset=Dataset.Discover,
            referrer=referrer,
            sample=sample_rate,
            # Ensures Snuba will not apply FINAL
            turbo=sample_rate is not None,
            limitby=[TOP_VALUES_DEFAULT_LIMIT, "tags_key"],
        )
        results.extend([
            FacetResult(r["tags_key"], r["tags_value"],
                        int(r["count"]) * multiplier)
            for r in tag_values["data"]
        ])

    return results
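The sampling branch above rescales counts by the inverse of the sample rate; a tiny self-contained sketch of that arithmetic (the counts are placeholders):

sample_rate = 0.1                      # 10% sample, as used above for large results
multiplier = 1 / sample_rate if sample_rate is not None else 1
sampled_count = 1234                   # count observed in the sampled query
estimated_count = int(sampled_count) * multiplier
assert estimated_count == 12340        # each sampled row stands in for ~10 real rows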
Ejemplo n.º 32
0
    def get_attrs(self, item_list, user):
        if not self._collapse("base"):
            attrs = super().get_attrs(item_list, user)
        else:
            seen_stats = self._get_seen_stats(item_list, user)
            if seen_stats:
                attrs = {item: seen_stats.get(item, {}) for item in item_list}
            else:
                attrs = {item: {} for item in item_list}

        if self.stats_period and not self._collapse("stats"):
            partial_get_stats = functools.partial(
                self.get_stats,
                item_list=item_list,
                user=user,
                environment_ids=self.environment_ids)
            stats = partial_get_stats()
            filtered_stats = (
                partial_get_stats(conditions=self.conditions)
                if self.conditions and not self._collapse("filtered")
                else None
            )
            for item in item_list:
                if filtered_stats:
                    attrs[item].update(
                        {"filtered_stats": filtered_stats[item.id]})
                attrs[item].update({"stats": stats[item.id]})

            if self._expand("sessions"):
                uniq_project_ids = list({item.project_id for item in item_list})
                cache_keys = {
                    pid: self._build_session_cache_key(pid)
                    for pid in uniq_project_ids
                }
                cache_data = cache.get_many(cache_keys.values())
                missed_items = []
                for item in item_list:
                    num_sessions = cache_data.get(cache_keys[item.project_id])
                    if num_sessions is None:
                        found = "miss"
                        missed_items.append(item)
                    else:
                        found = "hit"
                        attrs[item].update({
                            "sessionCount": num_sessions,
                        })
                    metrics.incr(f"group.get_session_counts.{found}")

                if missed_items:
                    filters = {
                        "project_id": list({item.project_id for item in missed_items}),
                    }
                    if self.environment_ids:
                        filters["environment"] = self.environment_ids

                    result_totals = raw_query(
                        selected_columns=["sessions"],
                        dataset=Dataset.Sessions,
                        start=self.start,
                        end=self.end,
                        filter_keys=filters,
                        groupby=["project_id"],
                        referrer="serializers.GroupSerializerSnuba.session_totals",
                    )
                    results = {}
                    for data in result_totals["data"]:
                        cache_key = self._build_session_cache_key(
                            data["project_id"])
                        results[data["project_id"]] = data["sessions"]
                        cache.set(cache_key, data["sessions"], 3600)

                    for item in missed_items:
                        if item.project_id in results.keys():
                            attrs[item].update({
                                "sessionCount": results[item.project_id],
                            })
                        else:
                            attrs[item].update({"sessionCount": None})

        if self._expand("inbox"):
            inbox_stats = get_inbox_details(item_list)
            for item in item_list:
                attrs[item].update({"inbox": inbox_stats.get(item.id)})

        if self._expand("owners"):
            owner_details = get_owner_details(item_list)
            for item in item_list:
                attrs[item].update({"owners": owner_details.get(item.id)})

        return attrs
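The session-count lookup above is a cache-aside pattern: read per-project keys from the cache, query Snuba only for the misses, then write the fresh totals back. A minimal stand-alone sketch of that pattern, with a plain dict standing in for the cache backend and a stubbed query function (both hypothetical):

fake_cache = {"sessions:1": 42}                    # pretend project 1 is already cached

def query_totals(project_ids):
    # Stand-in for the Snuba query above; returns a total per project id.
    return {pid: 100 for pid in project_ids}

def get_session_counts(project_ids):
    keys = {pid: "sessions:{}".format(pid) for pid in project_ids}
    counts = {pid: fake_cache.get(key) for pid, key in keys.items()}
    missed = [pid for pid, count in counts.items() if count is None]
    if missed:
        for pid, total in query_totals(missed).items():
            fake_cache[keys[pid]] = total          # write back for the next request
            counts[pid] = total
    return counts

print(get_session_counts([1, 2]))                  # {1: 42, 2: 100}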
Ejemplo n.º 33
0
    def validate(self, data):
        """
        Performs validation on an alert rule's data.
        This includes ensuring there are either 1 or 2 triggers, each of which
        has actions and proper thresholds set. The critical trigger should
        both alert and resolve 'after' the warning trigger (whether that means
        > or < the value depends on the threshold type).
        """
        data.setdefault("dataset", QueryDatasets.EVENTS)
        project_id = data.get("projects")
        if not project_id:
            # We just need a valid project id from the org so that we can verify
            # the query. We don't use the returned data anywhere, so it doesn't
            # matter which.
            project_id = list(self.context["organization"].project_set.all()[:1])
        try:
            snuba_filter = build_snuba_filter(
                data["dataset"],
                data["query"],
                data["aggregate"],
                data.get("environment"),
                data.get("event_types"),
                params={
                    "project_id": [p.id for p in project_id],
                    "start": timezone.now() - timedelta(minutes=10),
                    "end": timezone.now(),
                },
            )
            if any(cond[0] == "project_id" for cond in snuba_filter.conditions):
                raise serializers.ValidationError({"query": "Project is an invalid search term"})
        except (InvalidSearchQuery, ValueError) as e:
            raise serializers.ValidationError(f"Invalid Query or Metric: {e}")
        else:
            if not snuba_filter.aggregations:
                raise serializers.ValidationError(
                    "Invalid Metric: Please pass a valid function for aggregation"
                )

            try:
                raw_query(
                    aggregations=snuba_filter.aggregations,
                    start=snuba_filter.start,
                    end=snuba_filter.end,
                    conditions=snuba_filter.conditions,
                    filter_keys=snuba_filter.filter_keys,
                    having=snuba_filter.having,
                    dataset=Dataset(data["dataset"].value),
                    limit=1,
                    referrer="alertruleserializer.test_query",
                )
            except Exception:
                logger.exception("Error while validating snuba alert rule query")
                raise serializers.ValidationError(
                    "Invalid Query or Metric: An error occurred while attempting "
                    "to run the query"
                )

        triggers = data.get("triggers", [])
        if not triggers:
            raise serializers.ValidationError("Must include at least one trigger")
        if len(triggers) > 2:
            raise serializers.ValidationError(
                "Must send 1 or 2 triggers - A critical trigger, and an optional warning trigger"
            )

        event_types = data.get("event_types")

        valid_event_types = dataset_valid_event_types[data["dataset"]]
        if event_types and set(event_types) - valid_event_types:
            raise serializers.ValidationError(
                "Invalid event types for this dataset. Valid event types are %s"
                % sorted([et.name.lower() for et in valid_event_types])
            )

        for i, (trigger, expected_label) in enumerate(
            zip(triggers, (CRITICAL_TRIGGER_LABEL, WARNING_TRIGGER_LABEL))
        ):
            if trigger.get("label", None) != expected_label:
                raise serializers.ValidationError(
                    f'Trigger {i + 1} must be labeled "{expected_label}"'
                )
        critical = triggers[0]
        threshold_type = data["threshold_type"]

        self._validate_trigger_thresholds(threshold_type, critical, data.get("resolve_threshold"))

        if len(triggers) == 2:
            warning = triggers[1]
            self._validate_trigger_thresholds(
                threshold_type, warning, data.get("resolve_threshold")
            )
            self._validate_critical_warning_triggers(threshold_type, critical, warning)

        return data
Ejemplo n.º 34
0
def _get_release_sessions_time_bounds(project_id,
                                      release,
                                      org_id,
                                      environments=None):
    """
    Get the session time bounds, i.e. when the first session started and when
    the last session started, for a specific (project_id, org_id, release,
    environments) combination.
    Inputs:
        * project_id
        * release
        * org_id: Organisation Id
        * environments
    Return:
        Dictionary with two keys, "sessions_lower_bound" and "sessions_upper_bound",
        that correspond to when the first and last sessions occurred respectively.
    """
    def iso_format_snuba_datetime(date):
        return datetime.strptime(
            date, "%Y-%m-%dT%H:%M:%S+00:00").isoformat()[:19] + "Z"

    release_sessions_time_bounds = {
        "sessions_lower_bound": None,
        "sessions_upper_bound": None,
    }

    filter_keys = {"project_id": [project_id], "org_id": [org_id]}
    conditions = [["release", "=", release]]
    if environments is not None:
        conditions.append(["environment", "IN", environments])

    rows = raw_query(
        dataset=Dataset.Sessions,
        selected_columns=["first_session_started", "last_session_started"],
        aggregations=[
            ["min(started)", None, "first_session_started"],
            ["max(started)", None, "last_session_started"],
        ],
        conditions=conditions,
        filter_keys=filter_keys,
        referrer="sessions.release-sessions-time-bounds",
    )["data"]

    formatted_unix_start_time = datetime.utcfromtimestamp(0).strftime(
        "%Y-%m-%dT%H:%M:%S+00:00")

    if rows:
        rv = rows[0]

        # This check is added because if no sessions are found, the aggregation
        # query returns both sessions_lower_bound and sessions_upper_bound as the
        # `0` timestamp, and we do not want that behaviour by default.
        # P.S. To avoid confusion: the `0` timestamp, which is '1970-01-01 00:00:00',
        # is rendered as '0000-00-00 00:00:00' in the clickhouse shell.
        if set(rv.values()) != {formatted_unix_start_time}:
            release_sessions_time_bounds = {
                "sessions_lower_bound": iso_format_snuba_datetime(rv["first_session_started"]),
                "sessions_upper_bound": iso_format_snuba_datetime(rv["last_session_started"]),
            }
    return release_sessions_time_bounds
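A self-contained example of the datetime conversion performed by iso_format_snuba_datetime above; the input string is a placeholder in the '+00:00' format Snuba returns.

from datetime import datetime

def iso_format_snuba_datetime(date):
    # Same conversion as above: parse the "+00:00" suffix and emit a "Z" suffix.
    return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S+00:00").isoformat()[:19] + "Z"

assert iso_format_snuba_datetime("2021-03-04T05:06:07+00:00") == "2021-03-04T05:06:07Z"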
Ejemplo n.º 35
0
def get_performance_facets(
    query,
    params,
    orderby=None,
    aggregate_column="duration",
    aggregate_function="avg",
    limit=20,
    referrer=None,
):
    """
    High-level API for getting 'facet map' results for performance data

    Performance facets are high-frequency tags and the aggregate duration of
    their most frequent values.

    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.

    Returns Sequence[FacetResult]
    """
    with sentry_sdk.start_span(
        op="discover.discover", description="facets.filter_transform"
    ) as span:
        span.set_data("query", query)
        snuba_filter = get_filter(query, params)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(snuba_filter)

    # Exclude tracing tags as they are noisy and generally not helpful.
    # TODO(markus): Tracing tags are no longer written but may still reside in DB.
    excluded_tags = ["tags_key", "NOT IN", ["trace", "trace.ctx", "trace.span", "project"]]

    # Sample multi-project results, as we don't need full accuracy
    # with that much data.
    sample = len(snuba_filter.filter_keys["project_id"]) > 2

    with sentry_sdk.start_span(op="discover.discover", description="facets.frequent_tags"):
        # Get the tag keys with the highest deviation
        key_names = raw_query(
            aggregations=[["stddevSamp", aggregate_column, "stddev"]],
            start=snuba_filter.start,
            end=snuba_filter.end,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            orderby=["-stddev", "tags_key"],
            groupby="tags_key",
            # TODO(Kevan): Check using having vs where before mainlining
            having=[excluded_tags],
            dataset=Dataset.Discover,
            limit=limit,
            referrer=referrer,
            turbo=sample,
        )
        top_tags = [r["tags_key"] for r in key_names["data"]]
        if not top_tags:
            return []

    results = []

    sampling_enabled = True
    options_sample_rate = options.get("discover2.tags_performance_facet_sample_rate") or 0.1

    sample_rate = options_sample_rate if sampling_enabled else None

    max_aggregate_tags = 20
    aggregate_tags = []
    for i, tag in enumerate(top_tags):
        if i >= len(top_tags) - max_aggregate_tags:
            aggregate_tags.append(tag)

    if orderby is None:
        orderby = []

    if aggregate_tags:
        with sentry_sdk.start_span(op="discover.discover", description="facets.aggregate_tags"):
            conditions = snuba_filter.conditions
            conditions.append(["tags_key", "IN", aggregate_tags])
            tag_values = raw_query(
                aggregations=[[aggregate_function, aggregate_column, "aggregate"]],
                conditions=conditions,
                start=snuba_filter.start,
                end=snuba_filter.end,
                filter_keys=snuba_filter.filter_keys,
                orderby=orderby + ["tags_key"],
                groupby=["tags_key", "tags_value"],
                dataset=Dataset.Discover,
                referrer=referrer,
                sample=sample_rate,
                turbo=sample_rate is not None,
                limitby=[TOP_VALUES_DEFAULT_LIMIT, "tags_key"],
            )
            results.extend(
                [
                    FacetResult(r["tags_key"], r["tags_value"], int(r["aggregate"]))
                    for r in tag_values["data"]
                ]
            )

    return results
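A minimal usage sketch of get_performance_facets above, again assuming a Sentry environment where it is importable from sentry.snuba.discover (an assumption); the ids, filter, and date range are placeholders.

from datetime import datetime, timedelta

from sentry.snuba import discover  # assumed import path

now = datetime.utcnow()
facets = discover.get_performance_facets(
    query="transaction.duration:>0",               # placeholder filter
    params={"project_id": [1], "start": now - timedelta(days=7), "end": now},
    aggregate_column="duration",
    aggregate_function="avg",
    referrer="docs.example",
)
for facet in facets:
    print(facet)  # one FacetResult per (tag key, tag value) with its aggregate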
Ejemplo n.º 36
0
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions are supported for both the selected columns and the
    conditions.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
                    in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregate conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # We clobber this value throughout this code, so copy it first
    selected_columns = selected_columns[:]

    with sentry_sdk.start_span(
        op="discover.discover", description="query.filter_transform"
    ) as span:
        span.set_data("query", query)

        snuba_filter = get_filter(query, params)
        if not use_aggregate_conditions:
            assert (
                not auto_aggregations
            ), "Auto aggregations cannot be used without enabling aggregate conditions"
            snuba_filter.having = []

    function_translations = {}

    with sentry_sdk.start_span(op="discover.discover", description="query.field_translations"):
        if orderby is not None:
            orderby = list(orderby) if isinstance(orderby, (list, tuple)) else [orderby]
            snuba_filter.orderby = [get_function_alias(o) for o in orderby]

        resolved_fields = resolve_field_list(
            selected_columns,
            snuba_filter,
            auto_fields=auto_fields,
            auto_aggregations=auto_aggregations,
            functions_acl=functions_acl,
        )

        snuba_filter.update_with(resolved_fields)

        # Resolve the public aliases into the discover dataset names.
        snuba_filter, translated_columns = resolve_discover_aliases(
            snuba_filter, function_translations
        )

        # Make sure that any aggregate conditions are also in the selected columns
        for having_clause in snuba_filter.having:
            # The first element of the having can be an alias, or a nested array of functions. Loop through to make sure
            # any referenced functions are in the aggregations.
            error_extra = ", and could not be automatically added" if auto_aggregations else ""
            if isinstance(having_clause[0], (list, tuple)):
                # Functions are of the form [fn, [args]]
                args_to_check = [[having_clause[0]]]
                conditions_not_in_aggregations = []
                while len(args_to_check) > 0:
                    args = args_to_check.pop()
                    for arg in args:
                        if arg[0] in [SNUBA_AND, SNUBA_OR]:
                            args_to_check.extend(arg[1])
                        # Only need to iterate on arg[1] if it's a list
                        elif isinstance(arg[1], (list, tuple)):
                            alias = arg[1][0]
                            found = any(
                                alias == agg_clause[-1] for agg_clause in snuba_filter.aggregations
                            )
                            if not found:
                                conditions_not_in_aggregations.append(alias)

                if len(conditions_not_in_aggregations) > 0:
                    raise InvalidSearchQuery(
                        "Aggregate(s) {} used in a condition but are not in the selected columns{}.".format(
                            ", ".join(conditions_not_in_aggregations),
                            error_extra,
                        )
                    )
            else:
                found = any(
                    having_clause[0] == agg_clause[-1] for agg_clause in snuba_filter.aggregations
                )
                if not found:
                    raise InvalidSearchQuery(
                        "Aggregate {} used in a condition but is not a selected column{}.".format(
                            having_clause[0],
                            error_extra,
                        )
                    )

        if conditions is not None:
            snuba_filter.conditions.extend(conditions)

    with sentry_sdk.start_span(op="discover.discover", description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(
        op="discover.discover", description="query.transform_results"
    ) as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(
            result, resolved_fields["functions"], translated_columns, snuba_filter, selected_columns
        )
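A minimal usage sketch of the query API above, under the same assumptions about the sentry.snuba.discover import path; the column list, filter, and ids are placeholders.

from datetime import datetime, timedelta

from sentry.snuba import discover  # assumed import path

now = datetime.utcnow()
result = discover.query(
    selected_columns=["title", "count()"],
    query="event.type:error",                      # placeholder filter
    params={"project_id": [1], "start": now - timedelta(days=1), "end": now},
    orderby="-count",
    limit=10,
    referrer="docs.example",
)
for row in result["data"]:
    print(row)  # rows are keyed by the public aliases requested above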
Ejemplo n.º 37
0
def top_events_timeseries(
    timeseries_columns,
    selected_columns,
    user_query,
    params,
    orderby,
    rollup,
    limit,
    organization,
    referrer=None,
    top_events=None,
    allow_empty=True,
):
    """
    High-level API for doing arbitrary user timeseries queries for a limited number of top events

    Returns a dictionary of SnubaTSResult objects that have been zerofilled in
    case of gaps. Each value of the dictionary should match the result of a timeseries query

    timeseries_columns (Sequence[str]) List of public aliases to fetch for the timeseries query,
                    usually matches the y-axis of the graph
    selected_columns (Sequence[str]) List of public aliases to fetch for the events query,
                    this is to determine what the top events are
    user_query (str) Filter query string to create conditions from. Needs to be user_query
                    so it doesn't conflict with the query() function.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment,
    orderby (Sequence[str]) The fields to order results by.
    rollup (int) The bucket width in seconds
    limit (int) The number of events to get timeseries for
    organization (Organization) Used to map group ids to short ids
    referrer (str|None) A referrer string to help locate the origin of this query.
    top_events (dict|None) A dictionary with a 'data' key containing a list of dictionaries that
                    represent the top events matching the query. Useful when you have found
                    the top events earlier and want to save a query.
    """
    if top_events is None:
        with sentry_sdk.start_span(op="discover.discover",
                                   description="top_events.fetch_events"):
            top_events = query(
                selected_columns,
                query=user_query,
                params=params,
                orderby=orderby,
                limit=limit,
                referrer=referrer,
                auto_aggregations=True,
                use_aggregate_conditions=True,
            )

    with sentry_sdk.start_span(
            op="discover.discover",
            description="top_events.filter_transform") as span:
        span.set_data("query", user_query)
        snuba_filter, translated_columns = get_timeseries_snuba_filter(
            list(sorted(set(timeseries_columns + selected_columns))),
            user_query,
            params,
            rollup,
            default_count=False,
        )

        for field in selected_columns:
            # If we have a project field, we need to limit results by project so we don't hit the result limit
            if field in ["project", "project.id"] and top_events["data"]:
                snuba_filter.project_ids = [
                    event["project.id"] for event in top_events["data"]
                ]
                continue
            if field in FIELD_ALIASES:
                field = FIELD_ALIASES[field].alias
            # Note that because orderby shouldn't be an array field it's not included in the values
            values = list({
                event.get(field)
                for event in top_events["data"]
                if field in event and not isinstance(event.get(field), list)
            })
            if values:
                # timestamp fields need special handling; create a big OR instead
                if field == "timestamp" or field.startswith("timestamp.to_"):
                    snuba_filter.conditions.append([[field, "=", value]
                                                    for value in sorted(values)
                                                    ])
                elif None in values:
                    non_none_values = [
                        value for value in values if value is not None
                    ]
                    condition = [[["isNull", [resolve_discover_column(field)]],
                                  "=", 1]]
                    if non_none_values:
                        condition.append([
                            resolve_discover_column(field), "IN",
                            non_none_values
                        ])
                    snuba_filter.conditions.append(condition)
                elif field in FIELD_ALIASES:
                    snuba_filter.conditions.append([field, "IN", values])
                else:
                    snuba_filter.conditions.append(
                        [resolve_discover_column(field), "IN", values])

    with sentry_sdk.start_span(op="discover.discover",
                               description="top_events.snuba_query"):
        result = raw_query(
            aggregations=snuba_filter.aggregations,
            conditions=snuba_filter.conditions,
            filter_keys=snuba_filter.filter_keys,
            selected_columns=snuba_filter.selected_columns,
            start=snuba_filter.start,
            end=snuba_filter.end,
            rollup=rollup,
            orderby=["time"] + snuba_filter.groupby,
            groupby=["time"] + snuba_filter.groupby,
            dataset=Dataset.Discover,
            limit=10000,
            referrer=referrer,
        )

    if not allow_empty and not len(result.get("data", [])):
        return SnubaTSResult(
            {
                "data": zerofill([], snuba_filter.start, snuba_filter.end, rollup, "time"),
            },
            snuba_filter.start,
            snuba_filter.end,
            rollup,
        )

    with sentry_sdk.start_span(
            op="discover.discover",
            description="top_events.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        result = transform_data(result, translated_columns, snuba_filter)

        if "project" in selected_columns:
            translated_columns["project_id"] = "project"
        translated_groupby = [
            translated_columns.get(groupby, groupby)
            for groupby in snuba_filter.groupby
        ]

        issues = {}
        if "issue" in selected_columns:
            issues = Group.issues_mapping(
                {event["issue.id"]
                 for event in top_events["data"]},
                params["project_id"],
                organization,
            )
        # so the result key is consistent
        translated_groupby.sort()

        results = {}
        # Using the top events add the order to the results
        for index, item in enumerate(top_events["data"]):
            result_key = create_result_key(item, translated_groupby, issues)
            results[result_key] = {"order": index, "data": []}
        for row in result["data"]:
            result_key = create_result_key(row, translated_groupby, issues)
            if result_key in results:
                results[result_key]["data"].append(row)
            else:
                logger.warning(
                    "discover.top-events.timeseries.key-mismatch",
                    extra={
                        "result_key": result_key,
                        "top_event_keys": list(results.keys())
                    },
                )
        for key, item in results.items():
            results[key] = SnubaTSResult(
                {
                    "data": zerofill(item["data"], snuba_filter.start, snuba_filter.end, rollup, "time"),
                    "order": item["order"],
                },
                snuba_filter.start,
                snuba_filter.end,
                rollup,
            )

    return results
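A usage sketch for top_events_timeseries above. Unlike timeseries_query it returns a dictionary of SnubaTSResult objects, one per top event; the organization lookup, ids, and filter below are placeholders and the import paths are assumptions.

from datetime import datetime, timedelta

from sentry.models import Organization             # assumed import path
from sentry.snuba import discover                   # assumed import path

now = datetime.utcnow()
org = Organization.objects.get(slug="my-org")       # placeholder slug
results = discover.top_events_timeseries(
    timeseries_columns=["count()"],
    selected_columns=["transaction", "count()"],
    user_query="event.type:transaction",            # placeholder filter
    params={"project_id": [1], "start": now - timedelta(days=1), "end": now},
    orderby=["-count"],
    rollup=3600,
    limit=5,
    organization=org,
    referrer="docs.example",
)
for result_key in results:
    print(result_key)  # e.g. the transaction name of each top event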
Ejemplo n.º 38
0
def snuba_search(start,
                 end,
                 project_ids,
                 environment_ids,
                 tags,
                 sort_field,
                 cursor=None,
                 candidate_ids=None,
                 limit=None,
                 offset=0,
                 get_sample=False,
                 search_filters=None,
                 use_new_filters=False,
                 **parameters):
    """
    This function doesn't strictly benefit from or require being pulled out of the main
    query method above, but the query method is already large and this function at least
    extracts most of the Snuba-specific logic.

    Returns a tuple of:
     * a sorted list of (group_id, group_score) tuples sorted descending by score,
     * the count of total results (rows) available for this query.
    """

    from sentry.search.base import ANY

    filters = {
        'project_id': project_ids,
    }

    if environment_ids is not None:
        filters['environment'] = environment_ids

    if candidate_ids is not None:
        filters['issue'] = candidate_ids

    conditions = []
    if use_new_filters:
        having = []
        for search_filter in search_filters:
            if (
                    # Don't filter on issue fields here, they're not available
                    search_filter.key.name in issue_only_fields
                    # We special case date
                    or search_filter.key.name == 'date'):
                continue
            converted_filter = convert_search_filter_to_snuba_query(
                search_filter)
            if search_filter.key.name in aggregation_defs:
                having.append(converted_filter)
            else:
                conditions.append(converted_filter)
    else:
        having = SnubaConditionBuilder({
            'age_from': ScalarCondition('first_seen', '>'),
            'age_to': ScalarCondition('first_seen', '<'),
            'last_seen_from': ScalarCondition('last_seen', '>'),
            'last_seen_to': ScalarCondition('last_seen', '<'),
            'times_seen': CallbackCondition(
                lambda times_seen: ('times_seen', '=', times_seen),
            ),
            'times_seen_lower': ScalarCondition('times_seen', '>'),
            'times_seen_upper': ScalarCondition('times_seen', '<'),
        }).build(parameters)

        for tag, val in sorted(tags.items()):
            col = u'tags[{}]'.format(tag)
            if val == ANY:
                conditions.append((col, '!=', ''))
            else:
                conditions.append((col, '=', val))

    extra_aggregations = dependency_aggregations.get(sort_field, [])
    required_aggregations = set([sort_field, 'total'] + extra_aggregations)
    for h in having:
        alias = h[0]
        required_aggregations.add(alias)

    aggregations = []
    for alias in required_aggregations:
        aggregations.append(aggregation_defs[alias] + [alias])

    if cursor is not None:
        having.append(
            (sort_field, '>=' if cursor.is_prev else '<=', cursor.value))

    selected_columns = []
    if get_sample:
        # Get a random sample of matching groups. Because we use any(rand()),
        # we are testing against a single random value per group, and so the
        # sample is independent of the number of events in a group. Since we
        # are sampling using `ORDER BY random() LIMIT x`, we will always grab
        # the full result set if there are fewer than x total results.

        query_hash = md5(repr(conditions)).hexdigest()[:8]
        selected_columns.append(
            ('cityHash64', ("'{}'".format(query_hash), 'issue'), 'sample'))
        sort_field = 'sample'
        orderby = [sort_field]
        referrer = 'search_sample'
    else:
        # Get the top matching groups by score, i.e. the actual search results
        # in the order that we want them.
        orderby = ['-{}'.format(sort_field),
                   'issue']  # ensure stable sort within the same score
        referrer = 'search'

    snuba_results = snuba.raw_query(
        start=start,
        end=end,
        selected_columns=selected_columns,
        groupby=['issue'],
        conditions=conditions,
        having=having,
        filter_keys=filters,
        aggregations=aggregations,
        orderby=orderby,
        referrer=referrer,
        limit=limit,
        offset=offset,
        totals=True,  # Needs totals_mode=after_having_exclusive so we only get groups matching HAVING
        turbo=get_sample,  # Turn off FINAL when in sampling mode
        sample=1,  # Don't use clickhouse sampling, even when in turbo mode.
    )
    rows = snuba_results['data']
    total = snuba_results['totals']['total']

    if not get_sample:
        metrics.timing('snuba.search.num_result_groups', len(rows))

    return [(row['issue'], row[sort_field]) for row in rows], total
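The sampling branch above derives a short, stable hash of the serialized conditions and feeds it to cityHash64 so each group gets a reproducible pseudo-random score. A small sketch of that hash derivation; the .encode() call is a Python 3 detail not present in the original, and the conditions are placeholders.

from hashlib import md5

conditions = [("tags[browser]", "=", "Chrome")]    # placeholder conditions
# The same search always produces the same hash, so re-running it samples
# the same groups.
query_hash = md5(repr(conditions).encode("utf-8")).hexdigest()[:8]
print(query_hash)                                  # 8 hex characters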
Ejemplo n.º 39
0
def query(
    selected_columns,
    query,
    params,
    orderby=None,
    offset=None,
    limit=50,
    referrer=None,
    auto_fields=False,
    auto_aggregations=False,
    use_aggregate_conditions=False,
    conditions=None,
    functions_acl=None,
):
    """
    High-level API for doing arbitrary user queries against events.

    This function operates on the Discover public event schema; virtual fields
    and aggregate functions are supported for both the selected columns and the
    conditions.

    The resulting list will have all internal field names mapped
    back into their public schema names.

    selected_columns (Sequence[str]) List of public aliases to fetch.
    query (str) Filter query string to create conditions from.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment
    orderby (None|str|Sequence[str]) The field to order results by.
    offset (None|int) The record offset to read.
    limit (int) The number of records to fetch.
    referrer (str|None) A referrer string to help locate the origin of this query.
    auto_fields (bool) Set to true to have project + eventid fields automatically added.
    auto_aggregations (bool) Whether aggregates should be added automatically if they're used
                    in conditions, and there's at least one aggregate already.
    use_aggregate_conditions (bool) Set to true if aggregate conditions should be used at all.
    conditions (Sequence[any]) List of conditions that are passed directly to snuba without
                    any additional processing.
    """
    if not selected_columns:
        raise InvalidSearchQuery("No columns selected")

    # We clobber this value throughout this code, so copy it first
    selected_columns = selected_columns[:]

    snuba_query = prepare_discover_query(
        selected_columns,
        query,
        params,
        orderby,
        auto_fields,
        auto_aggregations,
        use_aggregate_conditions,
        conditions,
        functions_acl,
    )
    snuba_filter = snuba_query.filter

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.snuba_query"):
        result = raw_query(
            start=snuba_filter.start,
            end=snuba_filter.end,
            groupby=snuba_filter.groupby,
            conditions=snuba_filter.conditions,
            aggregations=snuba_filter.aggregations,
            selected_columns=snuba_filter.selected_columns,
            filter_keys=snuba_filter.filter_keys,
            having=snuba_filter.having,
            orderby=snuba_filter.orderby,
            dataset=Dataset.Discover,
            limit=limit,
            offset=offset,
            referrer=referrer,
        )

    with sentry_sdk.start_span(op="discover.discover",
                               description="query.transform_results") as span:
        span.set_data("result_count", len(result.get("data", [])))
        return transform_results(
            result,
            snuba_query.fields["functions"],
            snuba_query.columns,
            snuba_filter,
        )
Ejemplo n.º 40
0
def top_events_timeseries(
    timeseries_columns,
    selected_columns,
    user_query,
    params,
    orderby,
    rollup,
    limit,
    organization,
    referrer=None,
):
    """
    High-level API for doing arbitrary user timeseries queries for a limited number of top events

    Returns a dictionary of SnubaTSResult objects that have been zerofilled in
    case of gaps. Each value of the dictionary should match the result of a timeseries query

    timeseries_columns (Sequence[str]) List of public aliases to fetch for the timeseries query,
                        usually matches the y-axis of the graph
    selected_columns (Sequence[str]) List of public aliases to fetch for the events query,
                        this is to determine what the top events are
    user_query (str) Filter query string to create conditions from. Needs to be user_query
                        so it doesn't conflict with the query() function.
    params (Dict[str, str]) Filtering parameters with start, end, project_id, environment,
    orderby (Sequence[str]) The fields to order results by.
    rollup (int) The bucket width in seconds
    limit (int) The number of events to get timeseries for
    organization (Organization) Used to map group ids to short ids
    referrer (str|None) A referrer string to help locate the origin of this query.
    """
    top_events = query(
        selected_columns,
        query=user_query,
        params=params,
        orderby=orderby,
        limit=limit,
        referrer=referrer,
    )

    snuba_filter, translated_columns = get_timeseries_snuba_filter(
        timeseries_columns + selected_columns, user_query, params, rollup)

    user_fields = FIELD_ALIASES["user"]["fields"]

    for field in selected_columns:
        # project is handled by filter_keys already
        if field in ["project", "project.id"]:
            continue
        values = list({
            event.get(field)
            for event in top_events["data"] if field in event
        })
        if values and all(value is not None for value in values):
            # timestamp needs special handling; create a big OR instead
            if field == "timestamp":
                snuba_filter.conditions.append([["timestamp", "=", value]
                                                for value in values])
            # A user field can be any of its field aliases, do an OR across all the user fields
            elif field == "user":
                snuba_filter.conditions.append(
                    [[resolve_column(user_field), "IN", values]
                     for user_field in user_fields])
            else:
                snuba_filter.conditions.append(
                    [resolve_column(field), "IN", values])

    result = raw_query(
        aggregations=snuba_filter.aggregations,
        conditions=snuba_filter.conditions,
        filter_keys=snuba_filter.filter_keys,
        start=snuba_filter.start,
        end=snuba_filter.end,
        rollup=rollup,
        orderby="time",
        groupby=["time"] + snuba_filter.groupby,
        dataset=Dataset.Discover,
        limit=10000,
        referrer=referrer,
    )

    result = transform_results(result, translated_columns, snuba_filter,
                               selected_columns)

    translated_columns["project_id"] = "project"
    translated_groupby = [
        translated_columns.get(field, field) for field in snuba_filter.groupby
    ]

    if "user" in selected_columns:
        # Determine user-related fields to prune based on what wasn't selected, since transform_results does the same
        for field in user_fields:
            if field not in selected_columns:
                translated_groupby.remove(field)
        translated_groupby.append("user")
    issues = {}
    if "issue" in selected_columns:
        issues = Group.issues_mapping(
            set([event["issue.id"] for event in top_events["data"]]),
            params["project_id"],
            organization,
        )
    # so the result key is consistent
    translated_groupby.sort()

    results = {}
    for row in result["data"]:
        values = []
        for field in translated_groupby:
            if field == "issue.id":
                values.append(issues.get(row["issue.id"], "unknown"))
            else:
                values.append(six.text_type(row.get(field)))
        result_key = ",".join(values)
        results.setdefault(result_key, []).append(row)
    for key, item in six.iteritems(results):
        results[key] = SnubaTSResult(
            {
                "data": zerofill(item, snuba_filter.start, snuba_filter.end, rollup, "time"),
            },
            snuba_filter.start,
            snuba_filter.end,
            rollup,
        )

    return results
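The tail of the example above buckets timeseries rows under a comma-joined key built from the group-by values; a tiny self-contained sketch of that keying with placeholder rows:

translated_groupby = ["project", "transaction"]
rows = [
    {"project": "backend", "transaction": "/api/0/", "time": 1, "count": 3},
    {"project": "backend", "transaction": "/api/0/", "time": 2, "count": 5},
    {"project": "frontend", "transaction": "/home", "time": 1, "count": 2},
]

results = {}
for row in rows:
    result_key = ",".join(str(row.get(field)) for field in translated_groupby)
    results.setdefault(result_key, []).append(row)

print(sorted(results))                             # ['backend,/api/0/', 'frontend,/home']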
Ejemplo n.º 41
0
def get_project_release_stats(project_id,
                              release,
                              stat,
                              rollup,
                              start,
                              end,
                              environments=None):
    assert stat in ("users", "sessions")

    # Snuba treats the end of the query range as exclusive and we're bucketing
    # to a full hour, so round the end up to the next bucket boundary.
    end = to_datetime(
        (to_timestamp(end) // DATASET_BUCKET + 1) * DATASET_BUCKET)

    filter_keys = {"project_id": [project_id]}
    conditions = [["release", "=", release]]
    if environments is not None:
        conditions.append(["environment", "IN", environments])

    buckets = int((end - start).total_seconds() / rollup)
    stats = _make_stats(start, rollup, buckets, default=None)

    # Due to the nature of the probabilistic data structures, some
    # subtractions can become negative. As such we make sure a number
    # never goes below zero to avoid confusion.

    totals = {
        stat: 0,
        stat + "_healthy": 0,
        stat + "_crashed": 0,
        stat + "_abnormal": 0,
        stat + "_errored": 0,
    }

    for rv in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=[
                "bucketed_started",
                stat,
                stat + "_crashed",
                stat + "_abnormal",
                stat + "_errored",
                "duration_quantiles",
            ],
            groupby=["bucketed_started"],
            start=start,
            end=end,
            rollup=rollup,
            conditions=conditions,
            filter_keys=filter_keys,
    )["data"]:
        ts = parse_snuba_datetime(rv["bucketed_started"])
        bucket = int((ts - start).total_seconds() / rollup)
        stats[bucket][1] = {
            stat: rv[stat],
            stat + "_healthy": max(0, rv[stat] - rv[stat + "_errored"]),
            stat + "_crashed": rv[stat + "_crashed"],
            stat + "_abnormal": rv[stat + "_abnormal"],
            stat + "_errored": max(
                0,
                rv[stat + "_errored"] - rv[stat + "_crashed"] - rv[stat + "_abnormal"],
            ),
            "duration_p50": _convert_duration(rv["duration_quantiles"][0]),
            "duration_p90": _convert_duration(rv["duration_quantiles"][1]),
        }

        # Session stats can be summed up directly from these rows as the data
        # becomes available, without another query.
        if stat == "sessions":
            for k in totals:
                totals[k] += stats[bucket][1][k]

    for idx, bucket in enumerate(stats):
        if bucket[1] is None:
            stats[idx][1] = {
                stat: 0,
                stat + "_healthy": 0,
                stat + "_crashed": 0,
                stat + "_abnormal": 0,
                stat + "_errored": 0,
                "duration_p50": None,
                "duration_p90": None,
            }

    # For users we need a secondary query over the entire time range
    if stat == "users":
        rows = raw_query(
            dataset=Dataset.Sessions,
            selected_columns=[
                "users", "users_crashed", "users_abnormal", "users_errored"
            ],
            start=start,
            end=end,
            conditions=conditions,
            filter_keys=filter_keys,
        )["data"]
        if rows:
            rv = rows[0]
            totals = {
                "users": rv["users"],
                "users_healthy": max(0, rv["users"] - rv["users_errored"]),
                "users_crashed": rv["users_crashed"],
                "users_abnormal": rv["users_abnormal"],
                "users_errored": max(
                    0,
                    rv["users_errored"] - rv["users_crashed"] - rv["users_abnormal"],
                ),
            }

    return stats, totals
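A self-contained sketch of the end-of-range rounding at the top of the example above, assuming hour-sized buckets (the real DATASET_BUCKET value is not shown here); the input datetime is a placeholder.

from datetime import datetime, timezone

DATASET_BUCKET = 3600                              # assumed bucket width in seconds

def round_end_up(end):
    # Same arithmetic as above: push the exclusive end to the next bucket edge.
    ts = end.replace(tzinfo=timezone.utc).timestamp()
    rounded = (int(ts) // DATASET_BUCKET + 1) * DATASET_BUCKET
    return datetime.fromtimestamp(rounded, tz=timezone.utc)

print(round_end_up(datetime(2021, 5, 1, 12, 20)))  # 2021-05-01 13:00:00+00:00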
Ejemplo n.º 42
0
def _get_release_adoption(project_releases, environments=None, now=None):
    """Get the adoption of the last 24 hours (or a difference reference timestamp)."""
    conditions, filter_keys = _get_conditions_and_filter_keys(
        project_releases, environments)
    if now is None:
        now = datetime.now(pytz.utc)
    start = now - timedelta(days=1)

    total_conditions = []
    if environments is not None:
        total_conditions.append(["environment", "IN", environments])

    # Users Adoption
    total_users = {}
    # Session Adoption
    total_sessions = {}

    for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["project_id", "users", "sessions"],
            groupby=["project_id"],
            start=start,
            conditions=total_conditions,
            filter_keys=filter_keys,
            referrer="sessions.release-adoption-total-users-and-sessions",
    )["data"]:
        total_users[x["project_id"]] = x["users"]
        total_sessions[x["project_id"]] = x["sessions"]

    rv = {}
    for x in raw_query(
            dataset=Dataset.Sessions,
            selected_columns=["release", "project_id", "users", "sessions"],
            groupby=["release", "project_id"],
            start=start,
            conditions=conditions,
            filter_keys=filter_keys,
            referrer="sessions.release-adoption-list",
    )["data"]:
        # Users Adoption
        total_users_count = total_users.get(x["project_id"])

        users_adoption = None
        if total_users_count:
            users_adoption = float(x["users"]) / total_users_count * 100

        # Sessions Adoption
        total_sessions_count = total_sessions.get(x["project_id"])

        sessions_adoption = None
        if total_sessions_count:
            sessions_adoption = float(x["sessions"] / total_sessions_count * 100)

        rv[x["project_id"], x["release"]] = {
            "adoption": users_adoption,
            "sessions_adoption": sessions_adoption,
            "users_24h": x["users"],
            "sessions_24h": x["sessions"],
            "project_users_24h": total_users_count,
            "project_sessions_24h": total_sessions_count,
        }

    return rv
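A worked example of the adoption arithmetic above: a release's 24-hour users divided by all users seen on the project in the same window, expressed as a percentage (the counts are placeholders).

release_users_24h = 250
project_users_24h = 1000

users_adoption = None
if project_users_24h:                              # guard against division by zero
    users_adoption = float(release_users_24h) / project_users_24h * 100

assert users_adoption == 25.0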
Ejemplo n.º 43
0
def find_histogram_buckets(field, params, conditions):
    match = is_function(field)
    if not match:
        raise InvalidSearchQuery(
            u"received {}, expected histogram function".format(field))

    columns = [
        c.strip() for c in match.group("columns").split(",")
        if len(c.strip()) > 0
    ]

    if len(columns) != 2:
        raise InvalidSearchQuery(
            u"histogram(...) expects 2 column arguments, received {:g} arguments"
            .format(len(columns)))

    column = columns[0]
    # TODO evanh: This can be expanded to more fields at a later date, for now keep this limited.
    if column != "transaction.duration":
        raise InvalidSearchQuery(
            "histogram(...) can only be used with the transaction.duration column"
        )

    try:
        num_buckets = int(columns[1])
        if num_buckets < 1 or num_buckets > 500:
            raise Exception()
    except Exception:
        raise InvalidSearchQuery(
            u"histogram(...) requires a bucket value between 1 and 500, not {}"
            .format(columns[1]))

    alias = u"max_{}".format(column)

    conditions = deepcopy(conditions) if conditions else []
    found = False
    for cond in conditions:
        if (cond[0], cond[1], cond[2]) == ("event.type", "=", "transaction"):
            found = True
    if not found:
        conditions.append(["event.type", "=", "transaction"])
    translated_args, _ = resolve_discover_aliases({"conditions": conditions})

    results = raw_query(
        filter_keys={"project_id": params.get("project_id")},
        start=params.get("start"),
        end=params.get("end"),
        dataset=Dataset.Discover,
        conditions=translated_args["conditions"],
        aggregations=[["max", "duration", alias]],
    )
    if len(results["data"]) != 1:
        # If there are no transactions (and thus no max duration), return one empty bucket
        return "histogram({}, 1, 1)".format(column)

    bucket_max = results["data"][0][alias]
    if bucket_max == 0:
        raise InvalidSearchQuery(
            u"Cannot calculate histogram for {}".format(field))

    bucket_number = ceil(bucket_max / float(num_buckets))

    return "histogram({}, {:g}, {:g})".format(column, num_buckets,
                                              bucket_number)
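A worked example of the bucket sizing above: the bucket width is the maximum observed duration divided by the requested bucket count, rounded up (the max duration is a placeholder).

from math import ceil

num_buckets = 10
bucket_max = 5300                                  # placeholder max duration in ms
bucket_size = ceil(bucket_max / float(num_buckets))
print("histogram({}, {:g}, {:g})".format("transaction.duration", num_buckets, bucket_size))
# histogram(transaction.duration, 10, 530)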