Ejemplo n.º 1
0
    def secondary_should_rules(self, search_query, analyzer):
        """Return "secondary" should rules for the query.

        These are the ones using the weakest boosts, they are applied to fields
        containing more text: description & summary.

        Applied rules:

        * Look for phrase matches inside the summary (boost=3.0)
        * Look for phrase matches inside the description (boost=2.0).

        If we're using a supported language, both rules are done through a
        multi_match that considers both the default locale translation
        (using snowball analyzer) and the translation in the current language
        (using language-specific analyzer). If we're not using a supported
        language then only the first part is applied.
        """
        if not analyzer:
            # Unsupported language: phrase-match only the default-locale
            # fields.
            return [
                query.MatchPhrase(summary={
                    '_name': 'MatchPhrase(summary)',
                    'query': search_query, 'boost': 3.0}),
                query.MatchPhrase(description={
                    '_name': 'MatchPhrase(description)',
                    'query': search_query, 'boost': 2.0}),
            ]

        # Supported language: one MultiMatch per field covering both the
        # default-locale field and its language-specific variant.
        should = []
        for field, boost in (('summary', 3.0), ('description', 2.0)):
            localized_field = '%s_l10n_%s' % (field, analyzer)
            should.append(
                query.MultiMatch(
                    _name='MultiMatch(MatchPhrase(%s),MatchPhrase(%s))' % (
                        field, localized_field),
                    query=search_query,
                    type='phrase',
                    fields=[field, localized_field],
                    boost=boost,
                )
            )
        return should
Ejemplo n.º 2
0
def multi_match(field_list: List[str], search_term: str, **kwargs) -> Q.Query:
    """
    Helper method for generating a multi-match query.

    :param field_list: a field name, or an iterable of field names, to search
    :param search_term: the text to match against the fields
    :param kwargs: extra MultiMatch options (e.g. ``type``, ``operator``)
    :return: a ``Q.MultiMatch`` query object
    """
    # BUG FIX: the previous ``hasattr(field_list, "__iter__")`` check let a
    # bare string through (strings are iterable), which then ended up being
    # treated as a sequence of single-character field names. Wrap strings and
    # genuinely non-iterable values in a one-element list.
    if isinstance(field_list, str) or not hasattr(field_list, "__iter__"):
        field_list = [field_list]

    # Merge the mandatory parameters with caller-supplied options; kwargs
    # intentionally wins on key collisions, matching the old loop behavior.
    query_dict = {
        "query": search_term,
        "fields": list(field_list),
        **kwargs,
    }

    return Q.MultiMatch(**query_dict)
Ejemplo n.º 3
0
def multi_match(field_list, search_term, **kwargs):
    """Build a ``query.MultiMatch`` over one or more fields.

    :param field_list: a field name, or an iterable of field names; each is
        normalized through ``_get_field_name()`` before use
    :param search_term: the text to match against the fields
    :param kwargs: extra MultiMatch options (e.g. ``type``, ``operator``)
    :return: a ``query.MultiMatch`` query object
    """
    # BUG FIX: strings are iterable, so the old ``hasattr(..., "__iter__")``
    # test let a bare string through and it was then split into per-character
    # "fields". Treat strings (and non-iterables) as a single-field list.
    if isinstance(field_list, str) or not hasattr(field_list, "__iter__"):
        field_list = [field_list]

    formatted_field_list = [_get_field_name(field) for field in field_list]

    # kwargs intentionally wins on key collisions, as in the original loop.
    query_dict = {
        "query": search_term,
        "fields": formatted_field_list,
        **kwargs,
    }

    return query.MultiMatch(**query_dict)
Ejemplo n.º 4
0
    def secondary_should_rules(self, search_query, lang, rescore_mode=False):
        """Return "secondary" should rules for the query.

        These are the ones using the weakest boosts, they are applied to fields
        containing more text: description & summary.

        Applied rules:

        * Look for matches inside the summary (boost=3.0)
        * Look for matches inside the description (boost=2.0).

        If we're using a supported language, both rules are done through a
        multi_match that considers both the default locale translation
        (using snowball analyzer) and the translation in the current language
        (using language-specific analyzer). If we're not using a supported
        language then only the first part is applied.

        If rescore_mode is True, the match applied are match_phrase queries
        with a slop of 10 instead of a regular match. As those are more
        expensive they are only done in the 'rescore' part of the query.
        """
        if rescore_mode is False:
            # Normal mode: plain matches requiring all terms to be present.
            query_class = query.Match
            query_kwargs = {
                'operator': 'and',
            }
            query_class_name = 'Match'
            multi_match_kwargs = {
                'operator': 'and',
            }
        else:
            # Rescore mode: more expensive phrase matches, with a generous
            # slop to still allow word reordering/insertion.
            query_class = query.MatchPhrase
            query_kwargs = {
                'slop': 10,
            }
            query_class_name = 'MatchPhrase'
            multi_match_kwargs = {
                'slop': 10,
                'type': 'phrase',
            }

        analyzer = self.get_locale_analyzer(lang)
        should = []
        for field_name, boost in (('summary', 3.0), ('description', 2.0)):
            if analyzer:
                # Like in generate_exact_name_match_query() and
                # primary_should_rules() above, we want to search in all
                # languages supported by this analyzer. (Renamed the loop
                # variable so it no longer shadows the `lang` parameter.)
                fields = [field_name]
                fields.extend(
                    '%s_l10n_%s' % (field_name, supported_lang)
                    for supported_lang in amo.SEARCH_ANALYZER_MAP[analyzer]
                )
                query_name = 'MultiMatch(%s)' % ', '.join(
                    ['%s(%s)' % (query_class_name, field) for field in fields]
                )
                should.append(
                    # When *not* doing a rescore, we do regular non-phrase
                    # matches with 'operator': 'and' (see query_class and
                    # multi_match_kwargs above). This may seem wrong, the ES
                    # docs warn against this, but this is exactly what we want
                    # here: we want all terms to be present in either of the
                    # fields individually, not some in one and some in another.
                    query.MultiMatch(
                        _name=query_name,
                        query=search_query,
                        fields=fields,
                        boost=boost,
                        **multi_match_kwargs,
                    )
                )
            else:
                # BUG FIX: the keyword was previously hard-coded to
                # ``summary=``, so the description rule also queried the
                # summary field. Build the field kwarg dynamically instead.
                should.append(
                    query_class(
                        **{
                            field_name: dict(
                                _name='%s(%s)' % (query_class_name, field_name),
                                query=search_query,
                                boost=boost,
                                **query_kwargs,
                            )
                        }
                    ),
                )

        return should
Ejemplo n.º 5
0
    def primary_should_rules(self, search_query, lang):
        """Return "primary" should rules for the query.

        These are the ones using the strongest boosts and are only applied to
        the add-on name.

        Applied rules:

        * Exact match on the name, using the right translation if possible
          (boost=100.0)
        * Then text matches, using a language specific analyzer if possible
          (boost=5.0)
        * Phrase matches that allows swapped terms (boost=8.0)
        * Then text matches, using the standard text analyzer (boost=6.0)
        * Then look for the query as a prefix of a name (boost=3.0)
        """
        # Exact name match gets the highest boost; built by a sibling helper.
        should = [self.generate_exact_name_match_query(search_query, lang)]

        # If we are searching with a language that we support, we also try to
        # do a match against the translated field. If not, we'll do a match
        # against the name in default locale below.
        analyzer = self.get_locale_analyzer(lang)
        if analyzer:
            # Like in generate_exact_name_match_query() above, we want to
            # search in all languages supported by this analyzer.
            # NOTE: the comprehension variable `lang` shadows the parameter;
            # harmless here (comprehensions have their own scope) but easy to
            # misread.
            fields = [
                'name_l10n_%s' % lang for lang in amo.SEARCH_ANALYZER_MAP[analyzer]
            ]
            should.append(
                query.MultiMatch(
                    **{
                        '_name': 'MultiMatch(%s)' % ','.join(fields),
                        'fields': fields,
                        'query': search_query,
                        'boost': 5.0,
                        'analyzer': analyzer,
                        # 'and': every term must appear in a single field.
                        'operator': 'and',
                    }
                )
            )

        # The rest of the rules are applied to 'name', the field containing the
        # default locale translation only. That field has word delimiter rules
        # to help find matches, lowercase filter, etc, at the expense of any
        # language-specific features.
        should.extend(
            [
                # Phrase match with slop=1 allows adjacent-term swaps.
                query.MatchPhrase(
                    **{
                        'name': {
                            '_name': 'MatchPhrase(name)',
                            'query': search_query,
                            'boost': 8.0,
                            'slop': 1,
                        },
                    }
                ),
                # Plain match with the standard analyzer, all terms required.
                query.Match(
                    **{
                        'name': {
                            '_name': 'Match(name)',
                            'analyzer': 'standard',
                            'query': search_query,
                            'boost': 6.0,
                            'operator': 'and',
                        },
                    }
                ),
                # Treat the whole query string as a name prefix.
                query.Prefix(
                    **{
                        'name': {
                            '_name': 'Prefix(name)',
                            'value': search_query,
                            'boost': 3.0,
                        },
                    }
                ),
            ]
        )

        # Add two queries inside a single DisMax rule (avoiding overboosting
        # when an add-on name matches both queries) to support partial & fuzzy
        # matches (both allowing some words in the query to be absent).
        # For short query strings only (long strings, depending on what
        # characters they contain and how many words are present, can be too
        # costly).
        # Again applied to 'name' in the default locale, without the
        # language-specific analysis.
        if len(search_query) < self.MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH:
            should.append(
                query.DisMax(
                    # We only care if one of these matches, so we leave tie_breaker
                    # to the default value of 0.0.
                    _name='DisMax(FuzzyMatch(name), Match(name.trigrams))',
                    boost=4.0,
                    queries=[
                        # For the fuzzy query, only slight mispellings should be
                        # corrected, but we allow some of the words to be absent
                        # as well:
                        # 1 or 2 terms: should all be present
                        # 3 terms: 2 should be present
                        # 4 terms or more: 25% can be absent
                        {
                            'match': {
                                'name': {
                                    'query': search_query,
                                    'prefix_length': 2,
                                    'fuzziness': 'AUTO',
                                    'minimum_should_match': '2<2 3<-25%',
                                }
                            }
                        },
                        # For the trigrams query, we require at least 66% of the
                        # trigrams to be present.
                        {
                            'match': {
                                'name.trigrams': {
                                    'query': search_query,
                                    'minimum_should_match': '66%',
                                }
                            }
                        },
                    ],
                )
            )

        return should
Ejemplo n.º 6
0
    def secondary_should_rules(self,
                               search_query,
                               analyzer,
                               rescore_mode=False):
        """Return "secondary" should rules for the query.

        These are the ones using the weakest boosts, they are applied to fields
        containing more text: description & summary.

        Applied rules:

        * Look for matches inside the summary (boost=3.0)
        * Look for matches inside the description (boost=2.0).

        If we're using a supported language, both rules are done through a
        multi_match that considers both the default locale translation
        (using snowball analyzer) and the translation in the current language
        (using language-specific analyzer). If we're not using a supported
        language then only the first part is applied.

        If rescore_mode is True, the match applied are match_phrase queries
        with a slop of 10 instead of a regular match. As those are more
        expensive they are only done in the 'rescore' part of the query.
        """
        if rescore_mode is False:
            # Normal mode: plain matches requiring all terms to be present.
            query_class = query.Match
            query_kwargs = {
                'operator': 'and',
            }
            query_class_name = 'Match'
            multi_match_kwargs = {
                'operator': 'and',
            }
        else:
            # Rescore mode: more expensive phrase matches with a generous slop.
            query_class = query.MatchPhrase
            query_kwargs = {
                'slop': 10,
            }
            query_class_name = 'MatchPhrase'
            multi_match_kwargs = {
                'slop': 10,
                'type': 'phrase',
            }

        if analyzer:
            summary_query_name = (
                'MultiMatch(%s(summary),%s(summary_l10n_%s))' %
                (query_class_name, query_class_name, analyzer))
            description_query_name = (
                'MultiMatch(%s(description),%s(description_l10n_%s))' %
                (query_class_name, query_class_name, analyzer))
            should = [
                # When *not* doing a rescore, we do regular non-phrase matches
                # with 'operator': 'and' (see query_class/multi_match_kwargs
                # above). This may seem wrong, the ES docs warn against this,
                # but this is exactly what we want here: we want all terms
                # to be present in either of the fields individually, not some
                # in one and some in another.
                query.MultiMatch(
                    _name=summary_query_name,
                    query=search_query,
                    fields=['summary', 'summary_l10n_%s' % analyzer],
                    boost=3.0,
                    **multi_match_kwargs),
                query.MultiMatch(
                    _name=description_query_name,
                    query=search_query,
                    fields=['description',
                            'description_l10n_%s' % analyzer],
                    boost=2.0,
                    **multi_match_kwargs),
            ]
        else:
            should = [
                query_class(summary=dict(_name='%s(summary)' %
                                         query_class_name,
                                         query=search_query,
                                         boost=3.0,
                                         **query_kwargs)),
                # BUG FIX: this rule previously targeted the ``summary`` field
                # while being named and boosted as the description rule; it now
                # queries ``description`` as intended.
                query_class(description=dict(_name='%s(description)' %
                                             query_class_name,
                                             query=search_query,
                                             boost=2.0,
                                             **query_kwargs)),
            ]

        return should
Ejemplo n.º 7
0
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms: (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    terms = {} if terms is None else terms

    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # content_xml can be huge and is unnecessary here, so restrict the
    # fields that come back.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Full-text search across title, description, and stripped content.
        search = search.query(query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"]))

    # Narrow down by taxonomy terms; a None value means "documents that are
    # missing this vocabulary entirely".
    for vocab_name, term_value in terms.items():
        if term_value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=vocab_name))
        else:
            search = search.query("match", **{vocab_name: term_value})

    if repo_slug is not None:
        # Narrow down by repository.
        search = search.query("match", repository=repo_slug)

    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            descending = sort_by.startswith("-")
            if descending:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if descending:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    # Per-vocabulary aggregations: one "missing" bucket and one "terms"
    # bucket each. aggs.bucket() mutates the search in place.
    for vocab_id in set(get_vocab_ids(repo_slug=repo_slug)):
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket("{key}_missing".format(key=vocab_key),
                           "missing",
                           field=vocab_key)
        search.aggs.bucket("{key}_buckets".format(key=vocab_key),
                           "terms",
                           field=vocab_key)

    # Built-in facets that always get a terms aggregation.
    for builtin in ('run', 'course', 'resource_type'):
        search.aggs.bucket('{key}_builtins'.format(key=builtin),
                           "terms",
                           field=builtin)

    return SearchResults(search)
Ejemplo n.º 8
0
def search_results(request, lang, version, per_page=10, orphans=3):
    """
    Search view to handle language and version specific queries.
    The old search view is being redirected here.
    """
    # 404 if there is no documentation release for this lang/version pair.
    release = get_object_or_404(DocumentRelease, version=version, lang=lang)
    form = DocSearchForm(request.GET or None, release=release)

    context = {
        'form': form,
        'lang': release.lang,
        'version': release.version,
        'release': release,
        'searchparams': request.GET.urlencode(),
        'version_is_dev': version == 'dev',
        'version_is_unsupported': version_is_unsupported(version),
    }

    if form.is_valid():
        q = form.cleaned_data.get('q')

        if q:
            # catch queries that are coming from browser search bars
            exact = (DocumentDocType.index_queryset()
                                    .filter(release=release, title=q)
                                    .first())
            if exact is not None:
                return redirect(exact)

            should = []
            # If the query uses search-operator characters, hand it to
            # SimpleQueryString so the operators are honored.
            if any(operator in q for operator in SIMPLE_SEARCH_OPERATORS):
                should.append(query.SimpleQueryString(fields=['title',
                                                              'content^5'],
                                                      query=q,
                                                      analyzer='stop',
                                                      default_operator='and'))
            else:
                # let's just use simple queries since they allow some
                # neat syntaxes for exclusion etc. For more info see
                # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html
                # NOTE(review): query.Match(query=q) targets a field literally
                # named "query" — looks odd; confirm it's intentional.
                should = [query.MultiMatch(fields=['title^10', 'content'],
                                           query=q,
                                           type='phrase_prefix'),
                          query.Match(query=q),
                          query.MultiMatch(fields=['title^5', 'content'],
                                           query=q,
                                           fuzziness=1)]

            # then apply the queries and filter out anything not matching
            # the wanted version and language, also highlight the content
            # and order the highlighted snippets by score so that the most
            # fitting result is used
            results = (DocumentDocType.search()
                                      .query(query.Bool(should=should))
                                      .filter('term', release__lang=release.lang)
                                      .filter('term', release__version=release.version)
                                      .highlight_options(order='score')
                                      .highlight('content'))

            page_number = request.GET.get('page') or 1
            paginator = SearchPaginator(results, per_page=per_page, orphans=orphans)

            # Accept numeric pages plus the literal string 'last'.
            try:
                page_number = int(page_number)
            except ValueError:
                if page_number == 'last':
                    page_number = paginator.num_pages
                else:
                    raise Http404(_("Page is not 'last', "
                                    "nor can it be converted to an int."))

            try:
                page = paginator.page(page_number)
            except InvalidPage as e:
                raise Http404(_('Invalid page (%(page_number)s): %(message)s') % {
                    'page_number': page_number,
                    'message': str(e)
                })

            context.update({
                'query': q,
                'page': page,
                'paginator': paginator,
            })

    # Activate the release's language for translated template strings.
    if release.lang != 'en':
        activate(release.lang)

    return render(request, 'docs/search_results.html', context)
Ejemplo n.º 9
0
def _get_games_search_query(q: str) -> esq.Q:
    """Build the games search query: public games matching *q*, with games
    whose seats are all filled demoted (not excluded) via a boosting query.
    """
    public_matches = esq.MultiMatch(query=q) & esq.Match(isPublic=True)
    return esq.Boosting(
        positive=public_matches,
        negative=_ALL_SEATS_FILLED_QUERY,
        negative_boost=0.5,
    )