def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups by combining the Django ``group_queryset``
    with a Snuba query, then paginate the scored groups.

    The queryset is first narrowed (optional environment join plus the
    ``first_release`` condition). If it yields no more than
    ``MAX_PRE_SNUBA_CANDIDATES`` hashes, those hashes are passed to Snuba as
    candidates (pre-filtering). Otherwise Snuba is queried without candidates
    and its results are checked against the queryset afterwards
    (post-filtering).

    Returns the paginator result with ``results`` replaced by the ``Group``
    objects loaded via ``in_bulk``; ids that are not loaded are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.
    """
    # TODO: Product decision: the `query` parameter is still matched against
    # Group.message, because that's what we've always done. We could run that
    # search against every event in Snuba instead, but results may differ.

    current_time = timezone.now()
    end = parameters.get('date_to') or (current_time + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be further back than 90 days, but apparently
    # `retention_window_start` can be None?
    lower_bounds = [
        retention_window_start,
        parameters.get('date_from'),
        current_time - timedelta(days=90),
    ]
    start = max(bound for bound in lower_bounds if bound)
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        def match_first_release(queryset, version):
            # Tie GroupEnvironment.first_release to a Release row with the
            # project's organization and the requested version.
            return queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                        ds.get_sql_column(Release, 'id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'organization'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'version'),
                    ),
                ],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        # Restrict the groups to those seen in the requested environment.
        environment_groups = group_queryset.extra(
            where=[
                u'{} = {}'.format(
                    ds.get_sql_column(Group, 'id'),
                    ds.get_sql_column(GroupEnvironment, 'group_id'),
                ),
                u'{} = %s'.format(
                    ds.get_sql_column(GroupEnvironment, 'environment_id'),
                ),
            ],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )
        group_queryset = builder.build(environment_groups, parameters)
    else:
        def match_first_release(queryset, version):
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        group_queryset = builder.build(group_queryset, parameters)

    # pre-filter query: fetch one row more than the maximum so that an
    # overflow can be detected below
    hash_rows = GroupHash.objects.filter(
        group__in=group_queryset
    ).values_list('hash', 'group_id')
    candidate_hashes = dict(hash_rows[:MAX_PRE_SNUBA_CANDIDATES + 1])
    num_candidates = len(candidate_hashes)
    metrics.timing('snuba.search.num_candidates', num_candidates)

    if num_candidates == 0:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates')
        return Paginator(Group.objects.none()).get_result()

    if num_candidates > MAX_PRE_SNUBA_CANDIDATES:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the MAX_PRE_SNUBA_CANDIDATES. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates')
        candidate_hashes = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    # {group_id: group_score, ...}
    snuba_groups = snuba_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        score_fn=score_fn,
        candidate_hashes=candidate_hashes,
        **parameters
    )
    metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))

    if not candidate_hashes:
        # pre-filtered candidates were *not* passed down to Snuba,
        # so we need to do post-filtering to verify Sentry DB predicates
        result_groups = []
        num_post_filters = 0
        for batch in chunked(snuba_groups.items(), MAX_POST_SNUBA_CHUNK):
            num_post_filters += 1
            surviving_ids = group_queryset.filter(
                id__in=[gid for gid, _ in batch]
            ).values_list('id', flat=True)
            for group_id in surviving_ids:
                result_groups.append((group_id, snuba_groups[group_id]))
        metrics.timing('snuba.search.num_post_filters', num_post_filters)
    else:
        # pre-filtered candidates were passed down to Snuba,
        # so we're finished with filtering
        result_groups = snuba_groups.items()

    scored_groups = [(score, group_id) for (group_id, score) in result_groups]
    paginator_results = SequencePaginator(
        scored_groups,
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    # Swap the group ids for Group objects, keeping the paginator's order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[key] for key in paginator_results.results if key in groups_by_id
    ]

    return paginator_results
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search groups across ``projects`` by combining the Django
    ``group_queryset`` with chunked Snuba queries, then paginate the result.

    Flow, in order:

    * narrow ``group_queryset`` (optional environments join plus the
      ``first_release`` condition; only ``projects[0]`` supplies the
      organization id);
    * answer directly from the queryset with a ``DateTimePaginator`` when the
      sort is ``date``, no cursor is given and no tags, environments or
      Snuba-side parameters are present;
    * return ``EMPTY_RESULT`` when the time window is empty after trimming to
      retention, or when the queryset has no candidates;
    * otherwise query Snuba in growing chunks, either restricted to the
      candidate ids (pre-filtering) or followed by a queryset check
      (post-filtering), until enough results are collected, Snuba runs out of
      groups, or the configured time budget is spent.

    Returns the paginator result with ``results`` replaced by ``Group``
    objects loaded via ``in_bulk``; ids that are not loaded are dropped.
    """
    # TODO: Product decision: the `query` parameter is still matched against
    # Group.message, because that's what we've always done. We could run that
    # search against every event in Snuba instead, but results may differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        def match_first_release(queryset, version):
            # Tie GroupEnvironment.first_release to a Release row with the
            # first project's organization and the requested version.
            return queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                        ds.get_sql_column(Release, 'id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'organization'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'version'),
                    ),
                ],
                params=[projects[0].organization_id, version],
                tables=[Release._meta.db_table],
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        # Restrict the groups to those seen in any requested environment;
        # one `%s` placeholder is emitted per environment.
        environment_groups = group_queryset.extra(
            where=[
                u'{} = {}'.format(
                    ds.get_sql_column(Group, 'id'),
                    ds.get_sql_column(GroupEnvironment, 'group_id'),
                ),
                u'{} IN ({})'.format(
                    ds.get_sql_column(GroupEnvironment, 'environment_id'),
                    ', '.join('%s' for _ in environments)
                ),
            ],
            params=[env.id for env in environments],
            tables=[GroupEnvironment._meta.db_table],
        )
        group_queryset = builder.build(environment_groups, parameters)
    else:
        def match_first_release(queryset, version):
            return queryset.filter(
                first_release__organization_id=projects[0].organization_id,
                first_release__version=version,
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        group_queryset = builder.build(group_queryset, parameters)

    now = timezone.now()
    end = parameters.get('date_to') or (now + ALLOWED_FUTURE_DELTA)

    # This search is for some time window that ends with "now",
    # so if the requested sort is `date` (`last_seen`) and there
    # are no other Snuba-based search predicates, we can simply
    # return the results from Postgres.
    snuba_only_parameters = (
        'age_from', 'age_to', 'last_seen_from', 'last_seen_to',
        'times_seen', 'times_seen_lower', 'times_seen_upper',
    )
    if (cursor is None
            and sort_by == 'date'
            and not tags
            and not environments
            and not any(name in parameters for name in snuba_only_parameters)):
        group_queryset = group_queryset.order_by('-last_seen')
        paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
        # When it's a simple django-only search, we count_hits like normal
        return paginator.get_result(limit, cursor, count_hits=count_hits)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    ninety_days_ago = now - timedelta(days=90)
    retention_date = max(
        bound for bound in (retention_window_start, ninety_days_ago) if bound
    )
    start = max(
        bound for bound in (retention_date, parameters.get('date_from')) if bound
    )
    end = max(retention_date, end)

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards
        # compatibility with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their
        # search is invalid.
        return EMPTY_RESULT

    # Here we check if all the django filters reduce the set of groups down
    # to something that we can send down to Snuba in a `group_id IN (...)`
    # clause. One id more than the maximum is fetched so an overflow can be
    # detected.
    max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
    candidate_ids = list(
        group_queryset.values_list('id', flat=True)[:max_candidates + 1]
    )
    num_candidates = len(candidate_ids)
    metrics.timing('snuba.search.num_candidates', num_candidates)

    if num_candidates == 0:
        # no matches could possibly be found from this point on
        metrics.incr('snuba.search.no_candidates', skip_internal=False)
        return EMPTY_RESULT

    if num_candidates > max_candidates:
        # If the pre-filter query didn't include anything to significantly
        # filter down the number of results (from 'first_release', 'query',
        # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
        # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
        # might have surpassed the `max_candidates`. In this case,
        # we *don't* want to pass candidates down to Snuba, and instead we
        # want Snuba to do all the filtering/sorting it can and *then* apply
        # this queryset to the results from Snuba, which we call
        # post-filtering.
        metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
        candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    # With candidates, every chunk must be able to hold all of them.
    min_chunk_limit = len(candidate_ids) if candidate_ids else 0
    offset = 0
    num_chunks = 0
    hits = None

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    if count_hits and candidate_ids is None:
        # If we have no candidates, get a random sample of groups matching
        # the snuba side of the query, and see how many of those pass the
        # post-filter in postgres. This should give us an estimate of the
        # total number of snuba matches that will be overall matches, which
        # we can use to get an estimate for X-Hits. Note no cursor, so we
        # are always estimating the total hits.

        # The number of samples required to achieve a certain error bound
        # with a certain confidence interval can be calculated from a
        # rearrangement of the normal approximation (Wald) confidence
        # interval formula:
        #
        # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
        #
        # Effectively if we want the estimate to be within +/- 10% of the
        # real value with 95% confidence, we would need (1.96^2 * p*(1-p))
        # / 0.1^2 samples. With a starting assumption of p=0.5 (this
        # requires the most samples) we would need 96 samples to achieve
        # +/-10% @ 95% confidence.
        sample_size = options.get('snuba.search.hits-sample-size')
        sampled_groups, snuba_total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [env.id for env in environments],
            tags=tags,
            sort_field=sort_field,
            limit=sample_size,
            offset=0,
            get_sample=True,
            **parameters
        )
        sampled_count = len(sampled_groups)
        if sampled_count == 0:
            return EMPTY_RESULT

        passing_count = group_queryset.filter(
            id__in=[gid for gid, _ in sampled_groups]
        ).count()
        # Scale Snuba's total by the share of sampled groups that passed.
        hit_ratio = passing_count / float(sampled_count)
        hits = int(hit_ratio * snuba_total)

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size, but if we have candidate_ids
        # always query for at least that many items
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        chunk_limit = max(chunk_limit, min_chunk_limit)

        # {group_id: group_score, ...}
        snuba_groups, total = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [env.id for env in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        num_snuba_results = len(snuba_groups)
        metrics.timing('snuba.search.num_snuba_results', num_snuba_results)

        # `offset` here is still the offset this chunk was requested with.
        more_results = num_snuba_results >= limit and (offset + limit) < total
        offset += num_snuba_results

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba, so we're
            # finished with filtering and these are the only results. Note
            # that because we set the chunk size to at least the size of
            # the candidate_ids, we know we got all of them (ie there are
            # no more chunks after the first)
            result_groups = snuba_groups
            if count_hits:
                hits = num_snuba_results
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            surviving_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            scores_by_group = dict(snuba_groups)
            for group_id in surviving_ids:
                # because we're doing multiple Snuba queries, which
                # happen outside of a transaction, there is a small possibility
                # of groups moving around in the sort scoring underneath us,
                # so we at least want to protect against duplicates
                if group_id not in result_group_ids:
                    group_score = scores_by_group[group_id]
                    result_group_ids.add(group_id)
                    result_groups.append((group_id, group_score))

            if count_hits:
                num_seen = len(result_groups)
                if more_results:
                    # We also could have underestimated hits from our sample
                    # and have already seen more hits than the estimate, so
                    # make sure hits is at least as big as what we have seen.
                    hits = max(hits, num_seen)
                else:
                    # We know we have got all possible groups from snuba and
                    # filtered them all down, so we have all hits.
                    # TODO this probably doesn't work because we could be on
                    # page N and not be including hits from previous pages.
                    hits = num_seen

        # TODO do we actually have to rebuild this SequencePaginator every time
        # or can we just make it after we've broken out of the loop?
        paginator_results = SequencePaginator(
            [(score, group_id) for (group_id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, known_hits=hits)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids or len(paginator_results.results) >= limit or not more_results:
            break

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    num_page_results = len(paginator_results.results)
    if num_page_results == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or num_page_results > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the group ids for Group objects, keeping the paginator's order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[key] for key in paginator_results.results if key in groups_by_id
    ]

    return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups by sending every id matched by the Django
    ``group_queryset`` to ``do_search`` as candidates, then paginate the
    scored groups.

    The score of each returned group is computed by the third element of the
    ``sort_strategies[sort_by]`` tuple from the data ``do_search`` returns for
    that group.

    Returns the paginator result with ``results`` replaced by the ``Group``
    objects loaded via ``in_bulk``; ids that are not loaded are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.
    """
    # TODO: Product decision: the `query` parameter is still matched against
    # Group.message, because that's what we've always done. We could run that
    # search against every event in Snuba instead, but results may differ.

    current_time = timezone.now()
    end = parameters.get('date_to') or (current_time + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be further back than 90 days, but apparently
    # `retention_window_start` can be None?
    lower_bounds = [
        retention_window_start,
        parameters.get('date_from'),
        current_time - timedelta(days=90),
    ]
    start = max(bound for bound in lower_bounds if bound)
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        def match_first_release(queryset, version):
            # Tie GroupEnvironment.first_release to a Release row with the
            # project's organization and the requested version.
            return queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                        ds.get_sql_column(Release, 'id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'organization'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'version'),
                    ),
                ],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        # Restrict the groups to those seen in the requested environment.
        environment_groups = group_queryset.extra(
            where=[
                '{} = {}'.format(
                    ds.get_sql_column(Group, 'id'),
                    ds.get_sql_column(GroupEnvironment, 'group_id'),
                ),
                '{} = %s'.format(
                    ds.get_sql_column(GroupEnvironment, 'environment_id'),
                ),
            ],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )
        group_queryset = builder.build(environment_groups, parameters)
    else:
        def match_first_release(queryset, version):
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        group_queryset = builder.build(group_queryset, parameters)

    # TODO: If the query didn't include anything to significantly filter
    # down the number of groups at this point ('first_release', 'query',
    # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
    # 'subscribed_by', 'active_at_from', or 'active_at_to') then this
    # queryset might return a *huge* number of groups. In this case, we
    # probably *don't* want to pass candidates down to Snuba, and rather we
    # want Snuba to do all the filtering/sorting it can and *then* apply
    # this queryset to the results from Snuba.
    #
    # However, if this did filter down the number of groups significantly,
    # then passing in candidates is, of course, valuable.
    #
    # Should we decide which way to handle it based on the number of
    # group_ids, the number of hashes? Or should we just always start the
    # query with Snuba? Something else?
    candidate_group_ids = list(group_queryset.values_list('id', flat=True))

    sort, extra_aggregations, calculate_cursor_for_group = sort_strategies[sort_by]

    group_data = do_search(
        project_id=project.id,
        environment_id=environment and environment.id,
        tags=tags,
        start=start,
        end=end,
        sort=sort,
        extra_aggregations=extra_aggregations,
        candidates=candidate_group_ids,
        **parameters
    )

    # {group_id: group_score, ...}
    group_to_score = {
        group_id: calculate_cursor_for_group(data)
        for group_id, data in group_data.items()
    }

    paginator_results = SequencePaginator(
        [(score, group_id) for (group_id, score) in group_to_score.items()],
        reverse=True,
        **paginator_options
    ).get_result(limit, cursor, count_hits=count_hits)

    # Swap the group ids for Group objects, keeping the paginator's order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[key] for key in paginator_results.results if key in groups_by_id
    ]

    return paginator_results
def _query(self, project, retention_window_start, group_queryset, tags, environment,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search one project's groups by combining the Django ``group_queryset``
    with chunked Snuba queries, then paginate the scored groups.

    When the ``snuba.search.max-pre-snuba-candidates`` option is set and
    ``limit`` does not exceed it, the queryset's ids are fetched up front. If
    there are few enough, they are passed to Snuba as candidates
    (pre-filtering). Otherwise Snuba is queried in growing chunks and each
    chunk is checked against the queryset (post-filtering). This continues
    until enough results are collected, Snuba has no more results, or the
    configured time budget is spent.

    ``count_hits`` is accepted but not used here: the paginator is always
    called with ``count_hits=False``.

    Returns the paginator result with ``results`` replaced by the ``Group``
    objects loaded via ``in_bulk``; ids that are not loaded are dropped.
    Raises ``AssertionError`` when the computed ``start`` is not before ``end``.
    """
    # TODO: Product decision: the `query` parameter is still matched against
    # Group.message, because that's what we've always done. We could run that
    # search against every event in Snuba instead, but results may differ.

    current_time = timezone.now()
    end = parameters.get('date_to') or (current_time + ALLOWED_FUTURE_DELTA)

    # TODO: Presumably we want to search back to the project's full retention,
    # which may be higher than 90 days in the past, but apparently
    # `retention_window_start` can be None(?), so we need a fallback.
    lower_bounds = [
        retention_window_start,
        parameters.get('date_from'),
        current_time - timedelta(days=90),
    ]
    start = max(bound for bound in lower_bounds if bound)
    assert start < end

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environment is not None:
        def match_first_release(queryset, version):
            # Tie GroupEnvironment.first_release to a Release row with the
            # project's organization and the requested version.
            return queryset.extra(
                where=[
                    '{} = {}'.format(
                        ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                        ds.get_sql_column(Release, 'id'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'organization'),
                    ),
                    '{} = %s'.format(
                        ds.get_sql_column(Release, 'version'),
                    ),
                ],
                params=[project.organization_id, version],
                tables=[Release._meta.db_table],
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        # Restrict the groups to those seen in the requested environment.
        environment_groups = group_queryset.extra(
            where=[
                u'{} = {}'.format(
                    ds.get_sql_column(Group, 'id'),
                    ds.get_sql_column(GroupEnvironment, 'group_id'),
                ),
                u'{} = %s'.format(
                    ds.get_sql_column(GroupEnvironment, 'environment_id'),
                ),
            ],
            params=[environment.id],
            tables=[GroupEnvironment._meta.db_table],
        )
        group_queryset = builder.build(environment_groups, parameters)
    else:
        def match_first_release(queryset, version):
            return queryset.filter(
                first_release__organization_id=project.organization_id,
                first_release__version=version,
            )

        builder = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(match_first_release),
        })
        group_queryset = builder.build(group_queryset, parameters)

    # maximum number of Group IDs to send down to Snuba,
    # if more Group ID candidates are found, a "bare" Snuba
    # search is performed and the result groups are then
    # post-filtered via queries to the Sentry DB
    max_pre_snuba_candidates = options.get('snuba.search.max-pre-snuba-candidates')

    # pre-filter query; skipped entirely (candidate_ids stays None) when the
    # option is unset/zero or the page size exceeds it
    candidate_ids = None
    if max_pre_snuba_candidates and limit <= max_pre_snuba_candidates:
        # fetch one id more than the maximum so an overflow can be detected
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:max_pre_snuba_candidates + 1]
        )
        num_candidates = len(candidate_ids)
        metrics.timing('snuba.search.num_candidates', num_candidates)

        if num_candidates == 0:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates')
            return Paginator(Group.objects.none()).get_result()

        if num_candidates > max_pre_snuba_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `max_pre_snuba_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates')
            candidate_ids = None

    sort, extra_aggregations, score_fn = sort_strategies[sort_by]

    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    # With candidates, every chunk must be able to hold all of them.
    min_chunk_limit = len(candidate_ids) if candidate_ids else 0
    offset = 0
    num_chunks = 0

    paginator_results = Paginator(Group.objects.none()).get_result()
    result_groups = []
    result_group_ids = set()
    # Score extremes among the post-filtered groups; used for cursor logic.
    min_score = float('inf')
    max_score = -1

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size, but if we have candidate_ids
        # always query for at least that many items
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        chunk_limit = max(chunk_limit, min_chunk_limit)

        # {group_id: group_score, ...}
        snuba_groups, more_results = snuba_search(
            project_id=project.id,
            environment_id=environment and environment.id,
            tags=tags,
            start=start,
            end=end,
            sort=sort,
            extra_aggregations=extra_aggregations,
            score_fn=score_fn,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        num_snuba_results = len(snuba_groups)
        metrics.timing('snuba.search.num_snuba_results', num_snuba_results)
        offset += num_snuba_results

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            surviving_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            scores_by_group = dict(snuba_groups)
            for group_id in surviving_ids:
                # because we're doing multiple Snuba queries, which
                # happen outside of a transaction, there is a small possibility
                # of groups moving around in the sort scoring underneath us,
                # so we at least want to protect against duplicates
                if group_id in result_group_ids:
                    continue

                group_score = scores_by_group[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

                # used for cursor logic
                min_score = min(min_score, group_score)
                max_score = max(max_score, group_score)

        # HACK: If a cursor is being used and there may be more results available
        # in Snuba, we need to detect whether the cursor's value will be
        # found in the result groups. If it isn't in the results yet we need to
        # continue querying before we hand off to the paginator to decide whether
        # enough results are found or not, otherwise the paginator will happily
        # return `limit` worth of results that don't take the cursor into account
        # at all, since it can't know there are more results available.
        # TODO: If chunked search works in practice we should probably extend the
        # paginator to throw something if the cursor value is never found, or do
        # something other than partially leak internal paginator logic up to here.
        # Or make separate Paginator implementation just for Snuba search?
        if cursor is not None and not candidate_ids and more_results:
            if cursor.is_prev:
                keep_querying = min_score < cursor.value
            else:
                keep_querying = max_score > cursor.value
            if keep_querying:
                continue

        paginator_results = SequencePaginator(
            [(score, group_id) for (group_id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids or len(paginator_results.results) >= limit or not more_results:
            break

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the group ids for Group objects, keeping the paginator's order.
    groups_by_id = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [
        groups_by_id[key] for key in paginator_results.results if key in groups_by_id
    ]

    return paginator_results
def _query(self, projects, retention_window_start, group_queryset, tags, environments,
           sort_by, limit, cursor, count_hits, paginator_options, **parameters):
    """
    Search groups by combining Sentry DB predicates with a Snuba search.

    The Sentry DB queryset (``group_queryset``) is first narrowed by the
    ``first_release`` parameter and, when given, by environment. Then one of
    three things happens:

    * Fast path: with no cursor, a ``date`` sort, and no tags, environments
      or Snuba-only parameters, the queryset is ordered by ``-last_seen`` and
      paginated directly, without calling Snuba.
    * Pre-filtering: if the queryset yields at most ``num_candidates`` group
      ids, those ids are passed to a single Snuba search.
    * Post-filtering: otherwise Snuba is queried in growing chunks and every
      chunk is filtered through the queryset until the paginator has
      ``limit`` results, Snuba is exhausted, or the time budget
      (``snuba.search.max-total-chunk-time-seconds``) runs out.

    :param projects: sequence of projects; ``.id`` is read from each one and
        ``organization_id`` from the first one.
    :param retention_window_start: earliest searchable date, or a falsy
        value, in which case only the 90 day fallback applies.
    :param group_queryset: base queryset of candidate groups.
    :param tags: tag predicates forwarded to ``snuba_search``.
    :param environments: ``None`` or a sequence of objects with an ``id``.
    :param sort_by: key into ``sort_strategies``.
    :param limit: page size.
    :param cursor: pagination cursor or ``None``.
    :param count_hits: accepted for interface compatibility; every paginator
        call in this method passes ``count_hits=False``.
    :param paginator_options: keyword arguments forwarded to the paginators.
    :param parameters: remaining search parameters (``date_from``,
        ``date_to``, ``first_release``, ...), also forwarded to
        ``snuba_search``.
    :returns: a paginator result. On the Snuba path its ``results`` are
        replaced by the objects loaded through ``Group.objects.in_bulk``, in
        paginator order, skipping ids that could not be loaded.
        ``EMPTY_RESULT`` is returned when the time range is invalid or
        outside retention, or when pre-filtering finds no candidates.
    """
    # TODO: Product decision: we currently search Group.message to handle
    # the `query` parameter, because that's what we've always done. We could
    # do that search against every event in Snuba instead, but results may
    # differ.

    # TODO: It's possible `first_release` could be handled by Snuba.
    if environments is not None:
        # NOTE(review): an empty (but not None) `environments` sequence makes
        # the placeholder join below render `IN ()`. Presumably callers pass
        # either None or a non-empty sequence -- confirm.
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.extra(
                    where=[
                        '{} = {}'.format(
                            ds.get_sql_column(GroupEnvironment, 'first_release_id'),
                            ds.get_sql_column(Release, 'id'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'organization'),
                        ),
                        '{} = %s'.format(
                            ds.get_sql_column(Release, 'version'),
                        ),
                    ],
                    params=[projects[0].organization_id, version],
                    tables=[Release._meta.db_table],
                ),
            ),
        }).build(
            group_queryset.extra(
                where=[
                    u'{} = {}'.format(
                        ds.get_sql_column(Group, 'id'),
                        ds.get_sql_column(GroupEnvironment, 'group_id'),
                    ),
                    # one `%s` placeholder per environment id in `params`
                    u'{} IN ({})'.format(
                        ds.get_sql_column(GroupEnvironment, 'environment_id'),
                        ', '.join(['%s' for _ in environments])
                    ),
                ],
                params=[environment.id for environment in environments],
                tables=[GroupEnvironment._meta.db_table],
            ),
            parameters,
        )
    else:
        group_queryset = ds.QuerySetBuilder({
            'first_release': ds.CallbackCondition(
                lambda queryset, version: queryset.filter(
                    first_release__organization_id=projects[0].organization_id,
                    first_release__version=version,
                ),
            ),
        }).build(
            group_queryset,
            parameters,
        )

    now = timezone.now()
    end = parameters.get('date_to')
    if not end:
        end = now + ALLOWED_FUTURE_DELTA

        # This search is for some time window that ends with "now",
        # so if the requested sort is `date` (`last_seen`) and there
        # are no other Snuba-based search predicates, we can simply
        # return the results from Postgres.
        if cursor is None \
                and sort_by == 'date' \
                and not tags \
                and not environments \
                and not any(param in parameters for param in [
                    'age_from', 'age_to', 'last_seen_from',
                    'last_seen_to', 'times_seen', 'times_seen_lower',
                    'times_seen_upper'
                ]):
            group_queryset = group_queryset.order_by('-last_seen')
            paginator = DateTimePaginator(group_queryset, '-last_seen', **paginator_options)
            return paginator.get_result(limit, cursor, count_hits=False)

    # TODO: Presumably we only want to search back to the project's max
    # retention date, which may be closer than 90 days in the past, but
    # apparently `retention_window_start` can be None(?), so we need a
    # fallback.
    retention_date = max(
        filter(None, [
            retention_window_start,
            now - timedelta(days=90)
        ])
    )

    # Clamp both ends of the requested window to the retention date.
    start = max(
        filter(None, [
            retention_date,
            parameters.get('date_from'),
        ])
    )

    end = max([
        retention_date,
        end
    ])

    if start == retention_date and end == retention_date:
        # Both `start` and `end` must have been trimmed to `retention_date`,
        # so this entire search was against a time range that is outside of
        # retention. We'll return empty results to maintain backwards compatibility
        # with Django search (for now).
        return EMPTY_RESULT

    if start >= end:
        # TODO: This maintains backwards compatibility with Django search, but
        # in the future we should find a way to notify the user that their search
        # is invalid.
        return EMPTY_RESULT

    # num_candidates is the number of Group IDs to send down to Snuba, if
    # more Group ID candidates are found, a "bare" Snuba search is performed
    # and the result groups are then post-filtered via queries to the Sentry DB
    optimizer_enabled = options.get('snuba.search.pre-snuba-candidates-optimizer')
    if optimizer_enabled:
        # Per-project group counts come from the cache when available...
        keys = [self._get_project_count_cache_key(p.id) for p in projects]

        counts_by_projects = {
            self._get_project_id_from_key(key): count
            for key, count in cache.get_many(keys).items()
        }

        missed_projects = {p.id for p in projects} - set(counts_by_projects.keys())

        if missed_projects:
            # ...and from Snuba for the projects the cache did not have,
            # counted over the retention window computed above.
            missing_counts = snuba.query(
                start=retention_date,
                end=now,
                groupby=['project_id'],
                filter_keys={
                    'project_id': list(missed_projects),
                },
                aggregations=[['uniq', 'group_id', 'group_count']],
                referrer='search',
            )

            cache.set_many({
                self._get_project_count_cache_key(project_id): count
                for project_id, count in missing_counts.items()
            }, options.get('snuba.search.project-group-count-cache-time'))

            counts_by_projects.update(missing_counts)

        min_candidates = options.get('snuba.search.min-pre-snuba-candidates')
        max_candidates = options.get('snuba.search.max-pre-snuba-candidates')
        candidates_percentage = options.get('snuba.search.pre-snuba-candidates-percentage')

        # A percentage of the total group count, clamped to [min, max].
        num_candidates = max(
            min_candidates,
            min(
                max_candidates,
                sum(counts_by_projects.values()) * candidates_percentage
            )
        )
    else:
        num_candidates = options.get('snuba.search.min-pre-snuba-candidates')

    # pre-filter query
    candidate_ids = None
    if num_candidates and limit <= num_candidates:
        # Fetch one more than the threshold so "too many" can be detected.
        candidate_ids = list(
            group_queryset.values_list('id', flat=True)[:num_candidates + 1]
        )
        metrics.timing('snuba.search.num_candidates', len(candidate_ids))

        if not candidate_ids:
            # no matches could possibly be found from this point on
            metrics.incr('snuba.search.no_candidates', skip_internal=False)
            return EMPTY_RESULT
        elif len(candidate_ids) > num_candidates:
            # If the pre-filter query didn't include anything to significantly
            # filter down the number of results (from 'first_release', 'query',
            # 'status', 'bookmarked_by', 'assigned_to', 'unassigned',
            # 'subscribed_by', 'active_at_from', or 'active_at_to') then it
            # might have surpassed the `num_candidates`. In this case,
            # we *don't* want to pass candidates down to Snuba, and instead we
            # want Snuba to do all the filtering/sorting it can and *then* apply
            # this queryset to the results from Snuba, which we call
            # post-filtering.
            metrics.incr('snuba.search.too_many_candidates', skip_internal=False)
            candidate_ids = None

    sort_field = sort_strategies[sort_by]
    chunk_growth = options.get('snuba.search.chunk-growth-rate')
    max_chunk_size = options.get('snuba.search.max-chunk-size')
    chunk_limit = limit
    offset = 0
    num_chunks = 0

    paginator_results = EMPTY_RESULT
    result_groups = []
    result_group_ids = set()
    # Bound before the loop so the post-loop cursor fix-up can read it even
    # if the loop body never runs (e.g. a non-positive time budget).
    more_results = False

    max_time = options.get('snuba.search.max-total-chunk-time-seconds')
    time_start = time.time()

    # Do smaller searches in chunks until we have enough results
    # to answer the query (or hit the end of possible results). We do
    # this because a common case for search is to return 100 groups
    # sorted by `last_seen`, and we want to avoid returning all of
    # a project's groups and then post-sorting them all in Postgres
    # when typically the first N results will do.
    while (time.time() - time_start) < max_time:
        num_chunks += 1

        # grow the chunk size on each iteration to account for huge projects
        # and weird queries, up to a max size
        chunk_limit = min(int(chunk_limit * chunk_growth), max_chunk_size)
        # but if we have candidate_ids always query for at least that many items
        chunk_limit = max(chunk_limit, len(candidate_ids) if candidate_ids else 0)

        # snuba_groups: [(group_id, group_score), ...]
        snuba_groups, more_results = snuba_search(
            start=start,
            end=end,
            project_ids=[p.id for p in projects],
            environment_ids=environments and [environment.id for environment in environments],
            tags=tags,
            sort_field=sort_field,
            cursor=cursor,
            candidate_ids=candidate_ids,
            limit=chunk_limit,
            offset=offset,
            **parameters
        )
        metrics.timing('snuba.search.num_snuba_results', len(snuba_groups))
        offset += len(snuba_groups)

        if not snuba_groups:
            break

        if candidate_ids:
            # pre-filtered candidates were passed down to Snuba,
            # so we're finished with filtering and these are the
            # only results
            result_groups = snuba_groups
        else:
            # pre-filtered candidates were *not* passed down to Snuba,
            # so we need to do post-filtering to verify Sentry DB predicates
            filtered_group_ids = group_queryset.filter(
                id__in=[gid for gid, _ in snuba_groups]
            ).values_list('id', flat=True)

            group_to_score = dict(snuba_groups)
            for group_id in filtered_group_ids:
                if group_id in result_group_ids:
                    # because we're doing multiple Snuba queries, which
                    # happen outside of a transaction, there is a small possibility
                    # of groups moving around in the sort scoring underneath us,
                    # so we at least want to protect against duplicates
                    continue

                group_score = group_to_score[group_id]
                result_group_ids.add(group_id)
                result_groups.append((group_id, group_score))

        # The paginator takes (score, id) pairs, i.e. the reverse of the
        # (id, score) pairs collected above.
        paginator_results = SequencePaginator(
            [(score, group_id) for (group_id, score) in result_groups],
            reverse=True,
            **paginator_options
        ).get_result(limit, cursor, count_hits=False)

        # break the query loop for one of three reasons:
        # * we started with Postgres candidates and so only do one Snuba query max
        # * the paginator is returning enough results to satisfy the query (>= the limit)
        # * there are no more groups in Snuba to post-filter
        if candidate_ids \
                or len(paginator_results.results) >= limit \
                or not more_results:
            break

    # NOTE(review): if the loop exits before a SequencePaginator result is
    # built, `paginator_results` is still `EMPTY_RESULT`, so the attribute
    # assignments below write to that object. If it is a shared module-level
    # instance this would leak state into later calls -- confirm.

    # HACK: We're using the SequencePaginator to mask the complexities of going
    # back and forth between two databases. This causes a problem with pagination
    # because we're 'lying' to the SequencePaginator (it thinks it has the entire
    # result set in memory when it does not). For this reason we need to make some
    # best guesses as to whether the `prev` and `next` cursors have more results.
    if len(paginator_results.results) == limit and more_results:
        # Because we are going back and forth between DBs there is a small
        # chance that we will hand the SequencePaginator exactly `limit`
        # items. In this case the paginator will assume there are no more
        # results, so we need to override the `next` cursor's results.
        paginator_results.next.has_results = True

    if cursor is not None and (not cursor.is_prev or len(paginator_results.results) > 0):
        # If the user passed a cursor, and it isn't already a 0 result `is_prev`
        # cursor, then it's worth allowing them to go back a page to check for
        # more results.
        paginator_results.prev.has_results = True

    metrics.timing('snuba.search.num_chunks', num_chunks)

    # Swap the group ids for the loaded objects, keeping paginator order and
    # dropping ids that `in_bulk` did not return.
    groups = Group.objects.in_bulk(paginator_results.results)
    paginator_results.results = [groups[k] for k in paginator_results.results if k in groups]

    return paginator_results