Ejemplo n.º 1
0
    def secondary_should_rules(self, search_query, analyzer):
        """Build the "secondary" should clauses for a search query.

        These clauses carry the weakest boosts and target the longer text
        fields: summary and description.

        Clauses produced, in order:

        * Phrase match on ``summary`` (boost=2.0).
        * Phrase match on ``description`` (boost=2.0).
        * Only when ``analyzer`` is truthy: phrase match on
          ``summary_l10n_<analyzer>`` using that analyzer (boost=3.0).
        * Only when ``analyzer`` is truthy: phrase match on
          ``description_l10n_<analyzer>`` using that analyzer (boost=3.0).

        Returns the clauses as a list of query objects.
        """
        summary_rule = query.MatchPhrase(
            summary={'query': search_query, 'boost': 2.0})
        description_rule = query.MatchPhrase(
            description={'query': search_query, 'boost': 2.0})
        rules = [summary_rule, description_rule]

        if not analyzer:
            # No language-specific analyzer: only the generic fields apply.
            return rules

        # The analyzer name is used both as the suffix of the translated
        # field and as the analyzer applied to the query text.
        summary_l10n_field = 'summary_l10n_%s' % analyzer
        rules.append(query.MatchPhrase(**{
            summary_l10n_field: {
                'query': search_query, 'boost': 3.0, 'analyzer': analyzer},
        }))
        description_l10n_field = 'description_l10n_%s' % analyzer
        rules.append(query.MatchPhrase(**{
            description_l10n_field: {
                'query': search_query, 'boost': 3.0, 'analyzer': analyzer},
        }))

        return rules
Ejemplo n.º 2
0
    def secondary_should_rules(self, search_query, analyzer):
        """Build the "secondary" should clauses for a search query.

        These clauses carry the weakest boosts and target the fields holding
        more text: summary, description and tags.

        Clauses produced, in order:

        * Phrase match on ``summary`` (boost=0.8).
        * Phrase match on ``description`` (boost=0.3).
        * One ``match`` on ``tags`` per whitespace-separated word of
          ``search_query`` (boost=0.1 each).
        * Only when ``analyzer`` is truthy: phrase match on
          ``summary_l10n_<analyzer>`` using that analyzer (boost=0.6).
        * Only when ``analyzer`` is truthy: phrase match on
          ``description_l10n_<analyzer>`` using that analyzer (boost=0.6).

        Returns the clauses as a list of query objects.
        """
        summary_rule = query.MatchPhrase(
            summary={'query': search_query, 'boost': 0.8})
        description_rule = query.MatchPhrase(
            description={'query': search_query, 'boost': 0.3})
        rules = [summary_rule, description_rule]

        # Every word of the query gets its own 'match' clause so that each
        # matching tag contributes a small boost.
        rules.extend(
            query.Match(tags={'query': word, 'boost': 0.1})
            for word in search_query.split()
        )

        if not analyzer:
            # No language-specific analyzer: skip the translated fields.
            return rules

        # The analyzer name is used both as the suffix of the translated
        # field and as the analyzer applied to the query text.
        summary_l10n_field = 'summary_l10n_%s' % analyzer
        rules.append(query.MatchPhrase(**{
            summary_l10n_field: {
                'query': search_query,
                'boost': 0.6,
                'analyzer': analyzer,
            },
        }))
        description_l10n_field = 'description_l10n_%s' % analyzer
        rules.append(query.MatchPhrase(**{
            description_l10n_field: {
                'query': search_query,
                'boost': 0.6,
                'analyzer': analyzer,
            },
        }))

        return rules
Ejemplo n.º 3
0
    def secondary_should_rules(self, search_query, analyzer):
        """Build the "secondary" should clauses for a search query.

        These clauses carry the weakest boosts and target the fields holding
        more text: summary and description.

        Two clauses are always returned, in this order:

        * Phrase match on the summary (boost=3.0).
        * Phrase match on the description (boost=2.0).

        When ``analyzer`` is truthy, each clause is a phrase-type
        ``multi_match`` over the plain field and its
        ``<field>_l10n_<analyzer>`` counterpart. Otherwise each clause is a
        simple ``match_phrase`` on the plain field only. Every clause is
        given a ``_name`` describing its shape.
        """
        if not analyzer:
            # Unsupported language: only the plain fields are searched.
            return [
                query.MatchPhrase(summary={
                    '_name': 'MatchPhrase(summary)',
                    'query': search_query, 'boost': 3.0}),
                query.MatchPhrase(description={
                    '_name': 'MatchPhrase(description)',
                    'query': search_query, 'boost': 2.0}),
            ]

        # Supported language: search the plain field and the translated
        # field together so a hit in either one counts for the same rule.
        summary_fields = ['summary', 'summary_l10n_%s' % analyzer]
        summary_rule = query.MultiMatch(
            _name=(
                'MultiMatch(MatchPhrase(summary),'
                'MatchPhrase(summary_l10n_%s))' % analyzer),
            query=search_query,
            type='phrase',
            fields=summary_fields,
            boost=3.0,
        )

        description_fields = ['description', 'description_l10n_%s' % analyzer]
        description_rule = query.MultiMatch(
            _name=(
                'MultiMatch(MatchPhrase(description),'
                'MatchPhrase(description_l10n_%s))' % analyzer),
            query=search_query,
            type='phrase',
            fields=description_fields,
            boost=2.0,
        )

        return [summary_rule, description_rule]
Ejemplo n.º 4
0
    def primary_should_rules(self, search_query, lang):
        """Build the "primary" should clauses for a search query.

        These clauses use the strongest boosts and only look at the add-on
        name.

        Clauses produced, in order:

        * The exact-name rule returned by
          ``generate_exact_name_match_query(search_query, lang)``.
        * Only when ``get_locale_analyzer(lang)`` returns an analyzer: a
          ``multi_match`` over every ``name_l10n_<locale>`` field listed for
          that analyzer in ``amo.SEARCH_ANALYZER_MAP``, requiring all terms
          (boost=5.0).
        * Phrase match on ``name`` with a slop of 1 (boost=8.0).
        * Match on ``name`` with the ``standard`` analyzer, requiring all
          terms (boost=6.0).
        * Prefix match on ``name`` (boost=3.0).
        * Only when ``search_query`` is shorter than
          ``MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH``: a ``dis_max`` combining a
          fuzzy match on ``name`` and a match on ``name.trigrams``
          (boost=4.0).

        Returns the clauses as a list.
        """
        rules = [self.generate_exact_name_match_query(search_query, lang)]

        # With a supported language, match against the translated name
        # fields too; the default-locale name is covered by the rules below.
        analyzer = self.get_locale_analyzer(lang)
        if analyzer:
            # Same idea as the exact-name rule: cover every locale that
            # shares this analyzer.
            l10n_fields = [
                'name_l10n_%s' % locale
                for locale in amo.SEARCH_ANALYZER_MAP[analyzer]
            ]
            rules.append(
                query.MultiMatch(
                    _name='MultiMatch(%s)' % ','.join(l10n_fields),
                    fields=l10n_fields,
                    query=search_query,
                    boost=5.0,
                    analyzer=analyzer,
                    operator='and',
                )
            )

        # Everything from here on targets 'name', which only holds the
        # default locale translation.
        phrase_rule = query.MatchPhrase(
            name={
                '_name': 'MatchPhrase(name)',
                'query': search_query,
                'boost': 8.0,
                'slop': 1,
            }
        )
        standard_match_rule = query.Match(
            name={
                '_name': 'Match(name)',
                'analyzer': 'standard',
                'query': search_query,
                'boost': 6.0,
                'operator': 'and',
            }
        )
        prefix_rule = query.Prefix(
            name={
                '_name': 'Prefix(name)',
                'value': search_query,
                'boost': 3.0,
            }
        )
        rules.extend([phrase_rule, standard_match_rule, prefix_rule])

        # Partial and fuzzy matching is limited to short query strings: long
        # ones can be too costly depending on their characters and word count.
        if len(search_query) >= self.MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH:
            return rules

        # Fuzzy query: only slight misspellings are corrected, and some words
        # may be missing:
        #   1 or 2 terms: all must be present
        #   3 terms: 2 must be present
        #   4 terms or more: 25% can be absent
        fuzzy_name_match = {
            'match': {
                'name': {
                    'query': search_query,
                    'prefix_length': 2,
                    'fuzziness': 'AUTO',
                    'minimum_should_match': '2<2 3<-25%',
                }
            }
        }
        # Trigrams query: at least 66% of the trigrams must be present.
        trigrams_match = {
            'match': {
                'name.trigrams': {
                    'query': search_query,
                    'minimum_should_match': '66%',
                }
            }
        }
        # Both queries live in one DisMax rule so that a name matching both
        # is not boosted twice. Only the best match matters, so tie_breaker
        # is left unset.
        rules.append(
            query.DisMax(
                _name='DisMax(FuzzyMatch(name), Match(name.trigrams))',
                boost=4.0,
                queries=[fuzzy_name_match, trigrams_match],
            )
        )

        return rules