コード例 #1
0
def content_query(search_term: str, **kwargs) -> Q.DisMax:
    """
    Build the default ONS content query.

    The result is a DisMax over five sub-queries: a Bool of title clauses,
    a summary / meta-description multi-match, a keywords match, a
    CDID / dataset-id multi-match and a search-boost match (boost 100.0).

    :param search_term: the text to search for
    :param kwargs: extra keyword arguments passed straight through to Q.DisMax
    :return: the assembled Q.DisMax query
    """
    # Both single-field title clauses use the same minimum_should_match spec.
    title_msm = "1<-2 3<80% 5<60%"

    title_query = Q.Bool(should=[
        match(AvailableFields.TITLE_NO_DATES.value.name, search_term,
              type="boolean", boost=10.0, minimum_should_match=title_msm),
        match(AvailableFields.TITLE_NO_STEM.value.name, search_term,
              type="boolean", boost=10.0, minimum_should_match=title_msm),
        multi_match(
            [
                AvailableFields.TITLE.value.field_name_boosted,
                AvailableFields.EDITION.value.field_name_boosted,
            ],
            search_term,
            type="cross_fields",
            minimum_should_match="3<80% 5<60%",
        ),
    ])

    summary_query = multi_match(
        [AvailableFields.SUMMARY.value.name, AvailableFields.META_DESCRIPTION.value.name],
        search_term,
        type="best_fields",
        minimum_should_match="75%",
    )

    # Keywords and search-boost both require every term (operator AND).
    keywords_query = match(
        AvailableFields.KEYWORDS.value.name, search_term, type="boolean", operator="AND")

    identifier_query = multi_match(
        [AvailableFields.CDID.value.name, AvailableFields.DATASET_ID.value.name],
        search_term,
    )

    search_boost_query = match(
        AvailableFields.SEARCH_BOOST.value.name, search_term,
        type="boolean", operator="AND", boost=100.0)

    return Q.DisMax(
        queries=[
            title_query,
            summary_query,
            keywords_query,
            identifier_query,
            search_boost_query,
        ],
        **kwargs
    )
コード例 #2
0
ファイル: filters.py プロジェクト: osamamagdy/addons-server
    def generate_exact_name_match_query(self, search_query, lang):
        """
        Build the clause used for exact matching on the add-on name.

        An add-on whose name is exactly the search query is very likely
        what the user is after, so the clause runs term queries against
        the non-analyzed `.raw` sub-fields with a boost of 100.0.

        Two shapes are produced, depending on the analyzer returned by
        `self.get_locale_analyzer(lang)`:
        - No analyzer: a single Term query on `name.raw`.
        - An analyzer: a DisMax over a term query on `name.raw` plus one
          term query on `name_l10n_<language>.raw` for every language
          listed for that analyzer in `amo.SEARCH_ANALYZER_MAP`. Using
          DisMax means the boost is applied once even when several of
          the fields match.
        """
        analyzer = self.get_locale_analyzer(lang)

        if analyzer is None:
            return query.Term(**{
                'name.raw': {
                    '_name': 'Term(name.raw)',
                    'value': search_query,
                    'boost': 100.0,
                },
            })

        # Cover every language handled by this analyzer, so that results in
        # a closely related language can also be returned ('en-ca' when
        # searching in 'en-us' for instance).
        localized_fields = [
            'name_l10n_%s.raw' % supported_lang
            for supported_lang in amo.SEARCH_ANALYZER_MAP[analyzer]
        ]

        term_queries = [{'term': {'name.raw': search_query}}]
        for field_name in localized_fields:
            term_queries.append({'term': {field_name: search_query}})

        localized_label = ', '.join(
            ['Term(%s)' % field_name for field_name in localized_fields])

        # Only the best-matching clause matters, so tie_breaker is left at
        # its default value of 0.0.
        return query.DisMax(
            _name='DisMax(Term(name.raw), %s)' % localized_label,
            boost=100.0,
            queries=term_queries,
        )
コード例 #3
0
def test_dismax_to_dict():
    """A DisMax serializes to a "dis_max" dict holding its serialized sub-queries."""
    dis_max = query.DisMax(queries=[query.Term(_type='article')])

    expected = {
        "dis_max": {
            "queries": [
                {"term": {"_type": "article"}},
            ],
        },
    }

    assert expected == dis_max.to_dict()
コード例 #4
0
    def build_first_text_query(self, args):
        """
        Build the text part of the search from the request arguments.

        Reads `search_text` and `q` from `args`. When `search_text` is
        missing or empty a MatchAll query is returned; otherwise the result
        is a DisMax (tie_breaker 0) over the two prefix queries built by
        `self.vietnames_prefix_query` and `self.non_accented_prefix_query`.

        NOTE(review): only `search_text` is checked for presence; `q` is
        passed to its helper as-is, even when it is None.
        """
        accented_text = args.get('search_text')
        plain_text = args.get('q')

        if accented_text:
            prefix_queries = [
                self.vietnames_prefix_query(accented_text),
                self.non_accented_prefix_query(plain_text),
            ]
            return query.DisMax(queries=prefix_queries, tie_breaker=0)

        return query.MatchAll()
コード例 #5
0
ファイル: filters.py プロジェクト: yashahmad/addons-server
    def generate_exact_name_match_query(self, search_query, analyzer):
        """
        Build the clause used for exact matching on the add-on name.

        An add-on whose name is exactly the search query is very likely
        what the user is after, so the clause runs term queries against
        the non-analyzed `.raw` sub-fields with a boost of 100.0.

        Two shapes are produced:
        - `analyzer` is None: a single Term query on `name.raw`.
        - otherwise: a DisMax over term queries on `name.raw` and
          `name_l10n_<analyzer>.raw`. Using DisMax means the boost is
          applied once even when both fields match.
        """
        if analyzer is not None:
            localized_field = 'name_l10n_%s.raw' % analyzer
            term_queries = [
                {'term': {'name.raw': search_query}},
                {'term': {localized_field: search_query}},
            ]
            # Only the best-matching clause matters, so tie_breaker is left
            # at its default value of 0.0.
            return query.DisMax(
                _name='DisMax(Term(name.raw), Term(%s))' % localized_field,
                boost=100.0,
                queries=term_queries,
            )

        return query.Term(**{
            'name.raw': {
                '_name': 'Term(name.raw)',
                'value': search_query,
                'boost': 100.0,
            },
        })
コード例 #6
0
    def build_first_text_query(self, args):
        """
        Build the text part of the search from the request arguments.

        Reads `search_text` and `q` from `args`. When `search_text` is
        missing or empty a MatchAll query is returned; otherwise the result
        is a DisMax (tie_breaker 0.2) over five clauses: a prefix query,
        two phrase-prefix queries and two match queries using "AUTO"
        fuzziness, spread across the `name`, `name_no_tone` and
        `name_no_tone__raw` fields.

        NOTE(review): only `search_text` is checked for presence; `q` is
        passed to the builders as-is, even when it is None.
        """
        accented_text = args.get('search_text')
        plain_text = args.get('q')

        if not accented_text:
            return query.MatchAll()

        fuzziness = "AUTO"
        # The third positional argument of each builder is presumably a
        # boost (1000 / 200 / 100 / 5) -- confirm against the helpers.
        clauses = [
            build_prefix_query("name_no_tone__raw", plain_text, 1000),
            build_phrase_prefix_query("name", accented_text, 200),
            build_phrase_prefix_query("name_no_tone", plain_text, 100),
            build_match_query("name", accented_text, fuzziness, 5),
            build_match_query("name_no_tone", plain_text, fuzziness),
        ]
        return query.DisMax(queries=clauses, tie_breaker=0.2)
コード例 #7
0
def content_query(search_term, function_scores=None):
    """
    Build the default ONS content query.

    The core is a DisMax over five sub-queries: a Bool of title clauses,
    a summary / meta-description multi-match, a keywords match, a
    CDID / dataset-id multi-match and a search-boost match (boost 100.0).

    :param search_term: the text to search for
    :param function_scores: optional score functions; when given, the
        DisMax is wrapped in a FunctionScore using them
    :return: the DisMax itself, or a FunctionScore wrapping it
    """
    # Both single-field title clauses use the same minimum_should_match spec.
    title_msm = "1<-2 3<80% 5<60%"

    title_query = query.Bool(should=[
        match(fields.title_no_dates, search_term,
              type="boolean", boost=10.0, minimum_should_match=title_msm),
        match(fields.title_no_stem, search_term,
              type="boolean", boost=10.0, minimum_should_match=title_msm),
        multi_match(
            [fields.title.field_name_boosted, fields.edition.field_name_boosted],
            search_term,
            type="cross_fields",
            minimum_should_match="3<80% 5<60%",
        ),
    ])

    summary_query = multi_match(
        [fields.summary.name, fields.metaDescription.name],
        search_term,
        type="best_fields",
        minimum_should_match="75%",
    )

    # Keywords and search-boost both require every term (operator AND).
    keywords_query = match(
        fields.keywords, search_term, type="boolean", operator="AND")

    identifier_query = multi_match(
        [fields.cdid.name, fields.datasetId.name], search_term)

    search_boost_query = match(
        fields.searchBoost, search_term,
        type="boolean", operator="AND", boost=100.0)

    dis_max = query.DisMax(queries=[
        title_query,
        summary_query,
        keywords_query,
        identifier_query,
        search_boost_query,
    ])

    if function_scores is None:
        return dis_max
    return query.FunctionScore(query=dis_max, functions=function_scores)
コード例 #8
0
    def primary_should_rules(self, search_query, lang):
        """Build the "primary" should rules for the query.

        These carry the strongest boosts and only ever look at the add-on
        name.

        Rules, in the order they are added:

        * Exact match on the name, using the right translation if possible
          (boost=100.0)
        * Text match against the translated name fields, when a language
          specific analyzer is available (boost=5.0)
        * Phrase match on the name that tolerates swapped terms (boost=8.0)
        * Text match on the name using the standard analyzer (boost=6.0)
        * The query as a prefix of the name (boost=3.0)
        * For queries shorter than MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH only:
          a DisMax of a fuzzy match and a trigram match (boost=4.0)
        """
        rules = [self.generate_exact_name_match_query(search_query, lang)]

        # When the requested language has an analyzer, also match against
        # the translated fields. Otherwise only the default-locale name
        # rules below apply.
        analyzer = self.get_locale_analyzer(lang)
        if analyzer:
            # As in generate_exact_name_match_query(), cover every language
            # supported by this analyzer.
            localized_fields = [
                'name_l10n_%s' % supported_lang
                for supported_lang in amo.SEARCH_ANALYZER_MAP[analyzer]
            ]
            rules.append(
                query.MultiMatch(
                    _name='MultiMatch(%s)' % ','.join(localized_fields),
                    fields=localized_fields,
                    query=search_query,
                    boost=5.0,
                    analyzer=analyzer,
                    operator='and',
                )
            )

        # Everything from here on targets 'name', which only holds the
        # default locale translation. That field has word delimiter rules
        # to help find matches, a lowercase filter, etc, at the expense of
        # any language-specific features.
        rules.append(
            query.MatchPhrase(
                name={
                    '_name': 'MatchPhrase(name)',
                    'query': search_query,
                    'boost': 8.0,
                    'slop': 1,
                }
            )
        )
        rules.append(
            query.Match(
                name={
                    '_name': 'Match(name)',
                    'analyzer': 'standard',
                    'query': search_query,
                    'boost': 6.0,
                    'operator': 'and',
                }
            )
        )
        rules.append(
            query.Prefix(
                name={
                    '_name': 'Prefix(name)',
                    'value': search_query,
                    'boost': 3.0,
                }
            )
        )

        # Partial & fuzzy matching (both allow some words of the query to
        # be absent) is limited to short query strings: long strings,
        # depending on what characters they contain and how many words are
        # present, can be too costly.
        if len(search_query) >= self.MAX_QUERY_LENGTH_FOR_FUZZY_SEARCH:
            return rules

        # Fuzzy query: only slight misspellings should be corrected, but
        # some of the words may be absent as well:
        # 1 or 2 terms: should all be present
        # 3 terms: 2 should be present
        # 4 terms or more: 25% can be absent
        fuzzy_match = {
            'match': {
                'name': {
                    'query': search_query,
                    'prefix_length': 2,
                    'fuzziness': 'AUTO',
                    'minimum_should_match': '2<2 3<-25%',
                }
            }
        }
        # Trigrams query: at least 66% of the trigrams must be present.
        trigram_match = {
            'match': {
                'name.trigrams': {
                    'query': search_query,
                    'minimum_should_match': '66%',
                }
            }
        }

        # Both queries sit inside a single DisMax so that a name matching
        # both is not over-boosted. Only the best one matters, so
        # tie_breaker is left at its default value of 0.0.
        rules.append(
            query.DisMax(
                _name='DisMax(FuzzyMatch(name), Match(name.trigrams))',
                boost=4.0,
                queries=[fuzzy_match, trigram_match],
            )
        )
        return rules
コード例 #9
0
 def get_must_conditions(self, args):
     """
     Build the list of "must" conditions from the request arguments.

     * `file_id` adds a Terms filter when it is a list, a Term filter
       otherwise.
     * `q` adds a DisMax of phrase-prefix and match clauses over the file
       title, file tag text and description fields.
     * With neither present, the list holds a single MatchAll.

     :param args: mapping of request arguments (read via `.get`)
     :return: list of query objects
     :raises BadRequestException: when both `file_id` and `q` are given
     """
     def phrase_prefix(field, text, boost):
         # Phrase-prefix clause on a single field.
         return query.MatchPhrasePrefix(**{
             field: {
                 'query': text,
                 'boost': boost,
             }
         })

     def loose_match(field, text, boost):
         # OR-match on a single field with the shared "1<75%" threshold.
         return query.Match(**{
             field: {
                 'query': text,
                 'boost': boost,
                 'operator': 'or',
                 'minimum_should_match': "1<75%",
             }
         })

     conditions = []

     file_id = args.get('file_id')
     if file_id:
         filter_cls = query.Terms if isinstance(file_id, list) else query.Term
         conditions.append(filter_cls(file_id=file_id))

     search_text = args.get('q')
     if file_id and search_text:
         raise BadRequestException("Not support both q and file_id param")

     if search_text:
         text_clauses = [
             phrase_prefix('file_title', search_text, 10),
             phrase_prefix('file_title__no_tone', search_text, 10),
             loose_match('file_title', search_text, 4),
             loose_match('file_title__no_tone', search_text, 4),
             loose_match('file_tag__text', search_text, 2),
             phrase_prefix('file_tag__text', search_text, 2),
             loose_match('description', search_text, 1),
             loose_match('description__no_tone', search_text, 1),
         ]
         conditions.append(query.DisMax(queries=text_clauses))

     if not conditions:
         conditions.append(query.MatchAll())
     return conditions