def secondary_should_rules(self, search_query, analyzer):
    """Return "secondary" should rules for the query.

    These carry the weakest boosts and target the long-text fields,
    summary and description:

    * phrase match on the summary (boost=3.0)
    * phrase match on the description (boost=2.0)

    With a supported language analyzer, each rule is a multi_match over
    the default-locale field and its language-specific ``*_l10n_<analyzer>``
    translation; otherwise a plain match_phrase on the default-locale
    field is used.
    """
    should = []
    for field, boost in (('summary', 3.0), ('description', 2.0)):
        if analyzer:
            l10n_field = '%s_l10n_%s' % (field, analyzer)
            should.append(
                query.MultiMatch(
                    _name='MultiMatch(MatchPhrase(%s),MatchPhrase(%s))'
                          % (field, l10n_field),
                    query=search_query,
                    type='phrase',
                    fields=[field, l10n_field],
                    boost=boost,
                )
            )
        else:
            should.append(
                query.MatchPhrase(**{
                    field: {
                        '_name': 'MatchPhrase(%s)' % field,
                        'query': search_query,
                        'boost': boost,
                    },
                })
            )
    return should
def multi_match(field_list: List[str], search_term: str, **kwargs) -> Q.Query:
    """
    Helper method for generating a multi-match query.

    :param field_list: a single field name or an iterable of field names
    :param search_term: the text to match against the fields
    :param kwargs: extra options forwarded verbatim to ``Q.MultiMatch``
        (e.g. ``type``, ``operator``, ``boost``)
    :return: the constructed ``MultiMatch`` query object
    """
    # Strings are themselves iterable, so the old
    # ``hasattr(field_list, "__iter__") is False`` guard never fired for a
    # bare string on Python 3 and the string leaked through unwrapped.
    # Check isinstance(str) explicitly so a single field name is wrapped.
    if isinstance(field_list, str) or not hasattr(field_list, "__iter__"):
        field_list = [field_list]
    query_dict = {
        "query": search_term,
        # Materialize to a list so generators/tuples are accepted too.
        "fields": list(field_list),
    }
    # Merge caller options in one step instead of copying key by key.
    query_dict.update(kwargs)
    return Q.MultiMatch(**query_dict)
def multi_match(field_list, search_term, **kwargs):
    """Build a multi_match query over the given fields.

    ``field_list`` may be a single field or an iterable of fields; each
    entry is normalised through ``_get_field_name()`` before use. Extra
    keyword arguments are forwarded verbatim to ``query.MultiMatch``.
    """
    # Strings are iterable, so the previous
    # ``hasattr(field_list, "__iter__") is False`` guard never wrapped a
    # bare string on Python 3 and it would have been iterated character by
    # character below. Check isinstance(str) explicitly instead.
    if isinstance(field_list, str) or not hasattr(field_list, "__iter__"):
        field_list = [field_list]
    formatted_field_list = [_get_field_name(field) for field in field_list]
    query_dict = {
        "query": search_term,
        "fields": formatted_field_list,
    }
    # Merge caller options in one step instead of copying key by key.
    query_dict.update(kwargs)
    return query.MultiMatch(**query_dict)
def secondary_should_rules(self, search_query, lang, rescore_mode=False):
    """Return "secondary" should rules for the query.

    These are the ones using the weakest boosts, they are applied to fields
    containing more text: description & summary.

    Applied rules:

    * Look for matches inside the summary (boost=3.0)
    * Look for matches inside the description (boost=2.0).

    If we're using a supported language, both rules are done through a
    multi_match that considers both the default locale translation (using
    snowball analyzer) and the translation in the current language (using
    language-specific analyzer). If we're not using a supported language
    then only the first part is applied.

    If rescore_mode is True, the match applied are match_phrase queries
    with a slop of 5 instead of a regular match. As those are more
    expensive they are only done in the 'rescore' part of the query.
    """
    if rescore_mode is False:
        query_class = query.Match
        query_kwargs = {
            'operator': 'and',
        }
        query_class_name = 'Match'
        multi_match_kwargs = {
            'operator': 'and',
        }
    else:
        query_class = query.MatchPhrase
        query_kwargs = {
            'slop': 10,
        }
        query_class_name = 'MatchPhrase'
        multi_match_kwargs = {
            'slop': 10,
            'type': 'phrase',
        }

    analyzer = self.get_locale_analyzer(lang)
    should = []
    for field_name, boost in (('summary', 3.0), ('description', 2.0)):
        if analyzer:
            # Like in generate_exact_name_match_query() and
            # primary_should_rules() above, we want to search in all
            # languages supported by this analyzer.
            fields = [field_name]
            fields.extend(
                [
                    '%s_l10n_%s' % (field_name, lang)
                    for lang in amo.SEARCH_ANALYZER_MAP[analyzer]
                ]
            )
            query_name = 'MultiMatch(%s)' % ', '.join(
                ['%s(%s)' % (query_class_name, field) for field in fields]
            )
            should.append(
                # When *not* doing a rescore, we do regular non-phrase
                # matches with 'operator': 'and' (see query_class and
                # multi_match_kwargs above). This may seem wrong, the ES
                # docs warn against this, but this is exactly what we want
                # here: we want all terms to be present in either of the
                # fields individually, not some in one and some in another.
                query.MultiMatch(
                    _name=query_name,
                    query=search_query,
                    fields=fields,
                    boost=boost,
                    **multi_match_kwargs,
                )
            )
        else:
            should.append(
                # BUG FIX: key the query on the field being iterated over.
                # Previously this was hard-coded to ``summary=``, so the
                # description rule (boost=2.0, named '...(description)')
                # was incorrectly applied to the summary field.
                query_class(
                    **{
                        field_name: dict(
                            _name='%s(%s)' % (query_class_name, field_name),
                            query=search_query,
                            boost=boost,
                            **query_kwargs,
                        )
                    }
                ),
            )
    return should
def primary_should_rules(self, search_query, lang):
    """Return "primary" should rules for the query.

    These carry the strongest boosts and only ever target the add-on name.

    Applied rules:

    * Exact match on the name, using the right translation if possible
      (boost=100.0)
    * Then text matches, using a language specific analyzer if possible
      (boost=5.0)
    * Phrase matches that allows swapped terms (boost=8.0)
    * Then text matches, using the standard text analyzer (boost=6.0)
    * Then look for the query as a prefix of a name (boost=3.0)
    """
    should = [self.generate_exact_name_match_query(search_query, lang)]

    # If the requested language has a supported analyzer, also match
    # against every translated name field that analyzer covers; otherwise
    # only the default-locale 'name' matches below apply.
    analyzer = self.get_locale_analyzer(lang)
    if analyzer:
        # Like in generate_exact_name_match_query() above, we want to
        # search in all languages supported by this analyzer.
        fields = [
            'name_l10n_%s' % lang
            for lang in amo.SEARCH_ANALYZER_MAP[analyzer]
        ]
        should.append(
            query.MultiMatch(
                _name='MultiMatch(%s)' % ','.join(fields),
                fields=fields,
                query=search_query,
                boost=5.0,
                analyzer=analyzer,
                operator='and',
            )
        )

    # The remaining rules hit 'name', the default-locale translation only.
    # That field has word delimiter rules to help find matches, lowercase
    # filter, etc, at the expense of any language-specific features.
    should.append(
        query.MatchPhrase(
            name={
                '_name': 'MatchPhrase(name)',
                'query': search_query,
                'boost': 8.0,
                'slop': 1,
            },
        )
    )
    should.append(
        query.Match(
            name={
                '_name': 'Match(name)',
                'analyzer': 'standard',
                'query': search_query,
                'boost': 6.0,
                'operator': 'and',
            },
        )
    )
    should.append(
        query.Prefix(
            name={
                '_name': 'Prefix(name)',
                'value': search_query,
                'boost': 3.0,
            },
        )
    )

    # Add two queries inside a single DisMax rule (avoiding overboosting
    # when an add-on name matches both queries) to support partial & fuzzy
    # matches (both allowing some words in the query to be absent).
    # For short query strings only (long strings, depending on what
    # characters they contain and how many words are present, can be too
    # costly).
    # Again applied to 'name' in the default locale, without the
    # language-specific analysis.
    if len(search_query) < self.MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH:
        should.append(
            query.DisMax(
                # We only care if one of these matches, so we leave tie_breaker
                # to the default value of 0.0.
                _name='DisMax(FuzzyMatch(name), Match(name.trigrams))',
                boost=4.0,
                queries=[
                    # For the fuzzy query, only slight mispellings should be
                    # corrected, but we allow some of the words to be absent
                    # as well:
                    # 1 or 2 terms: should all be present
                    # 3 terms: 2 should be present
                    # 4 terms or more: 25% can be absent
                    {
                        'match': {
                            'name': {
                                'query': search_query,
                                'prefix_length': 2,
                                'fuzziness': 'AUTO',
                                'minimum_should_match': '2<2 3<-25%',
                            }
                        }
                    },
                    # For the trigrams query, we require at least 66% of the
                    # trigrams to be present.
                    {
                        'match': {
                            'name.trigrams': {
                                'query': search_query,
                                'minimum_should_match': '66%',
                            }
                        }
                    },
                ],
            )
        )
    return should
def secondary_should_rules(self, search_query, analyzer, rescore_mode=False):
    """Return "secondary" should rules for the query.

    These are the ones using the weakest boosts, they are applied to fields
    containing more text: description & summary.

    Applied rules:

    * Look for matches inside the summary (boost=3.0)
    * Look for matches inside the description (boost=2.0).

    If we're using a supported language, both rules are done through a
    multi_match that considers both the default locale translation (using
    snowball analyzer) and the translation in the current language (using
    language-specific analyzer). If we're not using a supported language
    then only the first part is applied.

    If rescore_mode is True, the match applied are match_phrase queries
    with a slop of 5 instead of a regular match. As those are more
    expensive they are only done in the 'rescore' part of the query.
    """
    if rescore_mode is False:
        query_class = query.Match
        query_kwargs = {
            'operator': 'and',
        }
        query_class_name = 'Match'
        multi_match_kwargs = {
            'operator': 'and',
        }
    else:
        query_class = query.MatchPhrase
        query_kwargs = {
            'slop': 10,
        }
        query_class_name = 'MatchPhrase'
        multi_match_kwargs = {
            'slop': 10,
            'type': 'phrase',
        }

    if analyzer:
        summary_query_name = (
            'MultiMatch(%s(summary),%s(summary_l10n_%s))'
            % (query_class_name, query_class_name, analyzer))
        description_query_name = (
            'MultiMatch(%s(description),%s(description_l10n_%s))'
            % (query_class_name, query_class_name, analyzer))
        should = [
            # When *not* doing a rescore, we do regular non-phrase matches
            # with 'operator': 'and' (see query_class/multi_match_kwargs
            # above). This may seem wrong, the ES docs warn against this,
            # but this is exactly what we want here: we want all terms
            # to be present in either of the fields individually, not some
            # in one and some in another.
            query.MultiMatch(
                _name=summary_query_name,
                query=search_query,
                fields=['summary', 'summary_l10n_%s' % analyzer],
                boost=3.0,
                **multi_match_kwargs),
            query.MultiMatch(
                _name=description_query_name,
                query=search_query,
                fields=['description', 'description_l10n_%s' % analyzer],
                boost=2.0,
                **multi_match_kwargs),
        ]
    else:
        should = [
            query_class(summary=dict(_name='%s(summary)' % query_class_name,
                                     query=search_query, boost=3.0,
                                     **query_kwargs)),
            # BUG FIX: this rule was previously keyed on ``summary`` while
            # being named and boosted as the description rule, so the
            # description field was never actually searched here.
            query_class(description=dict(
                _name='%s(description)' % query_class_name,
                query=search_query, boost=2.0,
                **query_kwargs)),
        ]
    return should
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms: (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}
    # Limit returned fields since content_xml can be huge and is
    # unnecessary.
    search = Search(
        index=INDEX_NAME, doc_type=DOC_TYPE
    ).fields(_get_field_names())

    if tokens is not None:
        # Full-text search on title, description, and content_xml
        # (minus markup).
        search = search.query(query.MultiMatch(
            query=tokens,
            fields=["title", "description", "content_stripped"],
        ))

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query(
                "query_string", query="_missing_:({key})".format(key=key))
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)

    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            descending = sort_by.startswith("-")
            if descending:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if descending:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    # One "missing" and one "terms" bucket per vocabulary, for facets.
    for vocab_id in set(get_vocab_ids(repo_slug=repo_slug)):
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key), "missing", field=vocab_key)
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key), "terms", field=vocab_key)
    # Built-in facets.
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=key), "terms", field=key)
    return SearchResults(search)
def search_results(request, lang, version, per_page=10, orphans=3):
    """
    Search view to handle language and version specific queries.
    The old search view is being redirected here.
    """
    # 404 if the requested language/version combination has no release.
    release = get_object_or_404(DocumentRelease, version=version, lang=lang)
    form = DocSearchForm(request.GET or None, release=release)

    context = {
        'form': form,
        'lang': release.lang,
        'version': release.version,
        'release': release,
        'searchparams': request.GET.urlencode(),
        'version_is_dev': version == 'dev',
        'version_is_unsupported': version_is_unsupported(version),
    }
    if form.is_valid():
        q = form.cleaned_data.get('q')
        if q:
            # catch queries that are coming from browser search bars
            exact = (DocumentDocType.index_queryset()
                     .filter(release=release, title=q)
                     .first())
            if exact is not None:
                # An exact title match short-circuits straight to the doc.
                return redirect(exact)
            should = []
            if any(operator in q for operator in SIMPLE_SEARCH_OPERATORS):
                # The query uses search-operator syntax, so hand it to a
                # simple_query_string query as-is.
                should.append(query.SimpleQueryString(fields=['title',
                                                              'content^5'],
                                                      query=q,
                                                      analyzer='stop',
                                                      default_operator='and'))
            else:
                # let's just use simple queries since they allow some
                # neat syntaxes for exclusion etc. For more info see
                # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
                should = [query.MultiMatch(fields=['title^10', 'content'],
                                           query=q,
                                           type='phrase_prefix'),
                          query.Match(query=q),
                          query.MultiMatch(fields=['title^5', 'content'],
                                           query=q,
                                           fuzziness=1)]
            # then apply the queries and filter out anything not matching
            # the wanted version and language, also highlight the content
            # and order the highlighted snippets by score so that the most
            # fitting result is used
            results = (DocumentDocType.search()
                       .query(query.Bool(should=should))
                       .filter('term', release__lang=release.lang)
                       .filter('term', release__version=release.version)
                       .highlight_options(order='score')
                       .highlight('content'))
            page_number = request.GET.get('page') or 1
            paginator = SearchPaginator(results, per_page=per_page,
                                        orphans=orphans)
            try:
                page_number = int(page_number)
            except ValueError:
                # 'last' is the only accepted non-numeric page value.
                if page_number == 'last':
                    page_number = paginator.num_pages
                else:
                    raise Http404(_("Page is not 'last', "
                                    "nor can it be converted to an int."))
            try:
                page = paginator.page(page_number)
            except InvalidPage as e:
                raise Http404(_('Invalid page (%(page_number)s): %(message)s') % {
                    'page_number': page_number,
                    'message': str(e)
                })
            context.update({
                'query': q,
                'page': page,
                'paginator': paginator,
            })
    # Render the page in the release's language (the default is English).
    # NOTE(review): placement assumed to be at function level (i.e. also on
    # invalid/empty queries) — the collapsed source is ambiguous; confirm.
    if release.lang != 'en':
        activate(release.lang)
    return render(request, 'docs/search_results.html', context)
def _get_games_search_query(q: str) -> esq.Q:
    """Build the games search query.

    Text-matches ``q`` against games flagged public, while down-weighting
    (negative_boost=0.5) games matching ``_ALL_SEATS_FILLED_QUERY``.
    """
    public_text_match = esq.MultiMatch(query=q) & esq.Match(isPublic=True)
    return esq.Boosting(
        positive=public_text_match,
        negative=_ALL_SEATS_FILLED_QUERY,
        negative_boost=0.5,
    )