Beispiel #1
0
def test_random_score():
    """
    Combine ``random_score`` and ``field_value_factor`` in one function_score.

    The ``random_score`` function outputs a number between 0 and 1; when it
    is given the same seed value it produces consistently random results.
    The ``random_score`` clause carries no filter here, so it applies to all
    documents. Note that if new documents matching the query are indexed,
    the order of the results changes whether or not consistent randomness
    is used.

    :return: None. The request body and each hit's city/location are printed.
    """
    scoring_functions = [
        query.SF('random_score', seed=10),
        query.SF(
            'field_value_factor',
            field='likes',
            modifier="log1p",
            factor=0.1,
        ),
    ]

    # max_boost limits the maximum effect of the functions: whatever
    # field_value_factor evaluates to, it will never exceed 1.5. It only caps
    # the result of the functions, not the final _score.
    q = query.Q(
        'function_score',
        functions=scoring_functions,
        score_mode="sum",
        max_boost=1.5,
    )

    s = House.search().query(q)
    print(s.to_dict())
    for h in s.execute():
        print(h.city, h.location)
Beispiel #2
0
    def apply_search_query(self, search_query, qs):
        """Wrap the should-rules for ``search_query`` in a function_score query.

        The primary and secondary should clauses are analyzed with the
        analyzer for the currently active language, then scored through the
        ``boost`` field and, when the waffle switch is active, an extra
        weight for webextensions / Mozilla-signed extensions.

        Returns the search "queryset" produced by ``qs.query(...)``.
        """
        analyzer = get_locale_analyzer(translation.get_language())

        # The query consists of a number of should clauses; the ones with the
        # higher boost are called "primary" for convenience.
        primary_should = self.primary_should_rules(search_query, analyzer)
        secondary_should = self.secondary_should_rules(search_query, analyzer)

        # Scoring is altered by the "boost" field defined in the mapping (used
        # to boost public addons higher than the rest) and, if the waffle
        # switch is on, by whether or not an addon is a webextension.
        functions = [query.SF('field_value_factor', field='boost')]
        if waffle.switch_is_active('boost-webextensions-in-search'):
            is_webextension = Q(
                'term', **{'current_version.files.is_webextension': True})
            is_mozilla_signed = Q(
                'term',
                **{'current_version.files.is_mozilla_signed_extension': True})
            webext_boost = query.SF({
                'weight': WEBEXTENSIONS_WEIGHT,
                'filter': is_webextension | is_mozilla_signed,
            })
            functions.append(webext_boost)

        # Assemble everything together and return the search "queryset".
        text_query = query.Bool(should=primary_should + secondary_should)
        return qs.query(
            'function_score', query=text_query, functions=functions)
Beispiel #3
0
def test_function_score_to_dict():
    """A function_score query with a filtered function serializes as expected."""
    expected = {
        'function_score': {
            'query': {'match': {'title': 'python'}},
            'functions': [
                {'random_score': {}},
                {
                    'filter': {'term': {'tags': 'python'}},
                    'field_value_factor': {'field': 'comment_count'},
                },
            ],
        }
    }

    title_match = query.Q('match', title='python')
    scoring_functions = [
        query.SF('random_score'),
        query.SF(
            'field_value_factor',
            field='comment_count',
            filter=query.Q('term', tags='python'),
        ),
    ]
    function_score = query.Q(
        'function_score', query=title_match, functions=scoring_functions)

    assert expected == function_score.to_dict()
def test_function_score_to_dict():
    """Check ``to_dict()`` output for a function_score with two functions."""
    inner_query = query.Q("match", title="python")
    random_fn = query.SF("random_score")
    comment_count_fn = query.SF(
        "field_value_factor",
        field="comment_count",
        filter=query.Q("term", tags="python"),
    )
    q = query.Q(
        "function_score",
        query=inner_query,
        functions=[random_fn, comment_count_fn],
    )

    # The function-level filter is emitted next to the function body, not
    # inside it.
    expected_functions = [
        {"random_score": {}},
        {
            "filter": {"term": {"tags": "python"}},
            "field_value_factor": {"field": "comment_count"},
        },
    ]
    expected = {
        "function_score": {
            "query": {"match": {"title": "python"}},
            "functions": expected_functions,
        }
    }
    assert expected == q.to_dict()
Beispiel #5
0
    def apply_search_query(self, search_query, qs, sort=None):
        """Attach relevance scoring for ``search_query`` to the search ``qs``.

        The clauses from ``primary_should_rules`` and
        ``secondary_should_rules`` are wrapped in a ``function_score`` query.
        When ``sort`` is ``None`` or ``'relevance'`` a rescore section is
        added on top.

        Returns the resulting search object.
        """
        lang = translation.get_language()

        # The query consists of a number of should clauses; the ones with the
        # higher boost are called "primary" for convenience.
        primary_should = self.primary_should_rules(search_query, lang)
        secondary_should = self.secondary_should_rules(search_query, lang)

        # Scoring depends on add-on popularity, on whether the add-on is
        # reviewed & public & non-experimental, and on whether or not it is
        # in a promoted group with a search boost.
        popularity = query.SF(
            'field_value_factor',
            field='average_daily_users',
            modifier='log2p')
        is_public_listing = (
            Q('term', is_experimental=False)
            & Q('terms', status=amo.REVIEWED_STATUSES)
            & Q('exists', field='current_version')
            & Q('term', is_disabled=False))
        functions = [
            popularity,
            query.SF({'weight': 4.0, 'filter': is_public_listing}),
        ]

        ranking_bump_groups = amo.utils.sorted_groupby(
            PROMOTED_GROUPS, lambda g: g.search_ranking_bump, reverse=True)
        for bump, promo_ids in ranking_bump_groups:
            if bump:
                # One weight function per non-zero ranking bump.
                group_ids = [p.id for p in promo_ids]
                in_groups = Q('terms', **{'promoted.group_id': group_ids})
                functions.append(
                    query.SF({'weight': bump, 'filter': in_groups}))

        # Assemble everything together.
        text_query = query.Bool(should=primary_should + secondary_should)
        scored = qs.query(
            'function_score', query=text_query, functions=functions)

        if sort is not None and sort != 'relevance':
            return scored

        # Searching by relevancy: rescore the top 10 (window_size below)
        # results per shard with more expensive rules using
        # match_phrase + slop.
        rescore_rules = self.rescore_rules(search_query, lang)
        rescore_body = {
            'window_size': 10,
            'query': {
                'rescore_query': query.Bool(should=rescore_rules).to_dict(),
            },
        }
        return scored.extra(rescore=rescore_body)
Beispiel #6
0
    def apply_search_query(self, search_query, qs, sort=None):
        """Build the scored (and optionally rescored) search for a text query.

        ``search_query`` is turned into should clauses using the analyzer of
        the active language; those are wrapped in a ``function_score`` query.
        A rescore section is added when ``sort`` is ``None`` or
        ``'relevance'``.

        Returns the resulting search object.
        """
        analyzer = get_locale_analyzer(translation.get_language())

        # The query consists of a number of should clauses; the ones with the
        # higher boost are called "primary" for convenience.
        primary_should = self.primary_should_rules(search_query, analyzer)
        secondary_should = self.secondary_should_rules(search_query, analyzer)

        # Scoring depends on add-on popularity and on whether the add-on is
        # reviewed & public & non-experimental.
        popularity = query.SF(
            'field_value_factor',
            field='average_daily_users',
            modifier='log2p')
        is_public_listing = (
            Q('term', is_experimental=False)
            & Q('terms', status=amo.REVIEWED_STATUSES)
            & Q('exists', field='current_version')
            & Q('term', is_disabled=False))
        functions = [
            popularity,
            query.SF({'weight': 4.0, 'filter': is_public_listing}),
        ]
        if switch_is_active('api-recommendations-priority'):
            recommended_boost = query.SF({
                'weight': 5.0,
                'filter': Q('term', is_recommended=True),
            })
            functions.append(recommended_boost)

        # Assemble everything together.
        text_query = query.Bool(should=primary_should + secondary_should)
        scored = qs.query(
            'function_score', query=text_query, functions=functions)

        if sort is not None and sort != 'relevance':
            return scored

        # Searching by relevancy: rescore the top 10 (window_size below)
        # results per shard with more expensive rules using
        # match_phrase + slop.
        rescore_rules = self.rescore_rules(search_query, analyzer)
        rescore_body = {
            'window_size': 10,
            'query': {
                'rescore_query': query.Bool(should=rescore_rules).to_dict(),
            },
        }
        return scored.extra(rescore=rescore_body)
Beispiel #7
0
    def filter_queryset(self, request, qs, view):
        """Apply the ``q`` request parameter to ``qs`` as a scored text query.

        The lower-cased ``q`` value is used as the search text. When it is
        missing or empty, ``qs`` is returned untouched. Otherwise the primary
        and secondary should clauses are wrapped in a ``function_score``
        query that multiplies in the ``boost`` field.
        """
        search_text = request.GET.get('q', '').lower()
        if not search_text:
            return qs

        analyzer = get_locale_analyzer(translation.get_language())

        # The query consists of a number of should clauses; the ones with the
        # higher boost are called "primary" for convenience.
        primary_should = self.primary_should_rules(search_text, analyzer)
        secondary_should = self.secondary_should_rules(search_text, analyzer)

        # Scoring is altered by the "boost" field defined in the mapping
        # (used to boost public addons higher than the rest).
        boost_functions = [query.SF('field_value_factor', field='boost')]

        # Assemble everything together and return the search "queryset".
        text_query = query.Bool(should=primary_should + secondary_should)
        return qs.query(
            'function_score', query=text_query, functions=boost_functions)
Beispiel #8
0
def test_function_score_exp():
    """
    Run an ``exp`` decay function over the ``created_at`` field.

    origin: the central point, or the best possible value for the field;
            documents that fall on the origin get a full _score of 1.0.
    scale:  the rate of decay, i.e. how quickly the _score changes as a
            document moves away from the origin (e.g. every 10 euros or
            every 100 metres).
    decay:  the _score a document receives at a distance of ``scale`` from
            the origin; defaults to 0.5.
    offset: a non-zero offset widens the central point so it covers a range
            of values instead of a single point. Every value in the range
            -offset <= origin <= +offset gets the full _score of 1.0.
    :return: None. The query dict and each hit's city/created_at are printed.
    """
    created_at_decay = {
        'origin': '2015-03-01',
        'scale': '10d',
        'offset': '0d',
        'decay': 0.5,
    }
    q = query.Q(
        'function_score',
        functions=[query.SF('exp', created_at=created_at_decay)],
    )
    print(q.to_dict())

    s = House.search().query(q)
    for h in s.execute():
        print(h.city, h.created_at)
Beispiel #9
0
 def filter_queryset(self, qs):
     """Keep only recommended documents and order them by a random score."""
     base_qs = super().filter_queryset(qs)
     recommended_only = query.Bool(filter=[Q('term', is_recommended=True)])
     filtered_qs = base_qs.query(recommended_only)
     # random_score supplies the _score that the final sort is applied to.
     randomized_qs = filtered_qs.query(
         'function_score', functions=[query.SF('random_score')])
     return randomized_qs.sort('_score')
def test_function_score_with_no_function_is_boost_factor():
    """An ``SF`` built from a plain dict keeps its weight and filter as given."""
    weighted = query.SF({"weight": 20, "filter": query.Q("term", f=42)})
    q = query.Q("function_score", functions=[weighted])

    expected = {
        "function_score": {
            "functions": [{"filter": {"term": {"f": 42}}, "weight": 20}],
        }
    }
    assert expected == q.to_dict()
Beispiel #11
0
    def filter_queryset(self, request, qs, view):
        """Sort ``qs`` according to the ``sort`` request parameter.

        ``sort`` is a comma-separated list of keys of ``self.SORTING_PARAMS``.
        Without it (or when it is exactly ``recommended``) a default order is
        used, which depends on whether ``q`` is present and on the
        ``api-recommendations-priority`` switch.

        Raises ``serializers.ValidationError`` for an unknown sort name, for
        ``random`` combined with other sorts, and for ``random`` used without
        the featured/recommended parameter or together with ``q``.
        """
        params = request.GET
        search_query_param = params.get('q')
        sort_param = params.get('sort')

        if sort_param is not None:
            split_sort_params = sort_param.split(',')

            # Random sort is a bit special.
            # First, it can't be combined with other sorts.
            if len(split_sort_params) > 1 and 'random' in split_sort_params:
                raise serializers.ValidationError(
                    'The "random" "sort" parameter can not be combined.')

            # Second, for perf reasons it's only available when the
            # 'featured' or 'recommended' param is present (to limit the
            # number of documents the random score is applied to) and a
            # search query is absent (to prevent clashing with the score
            # functions coming from a search query).
            if sort_param == 'random':
                is_narrowed = (
                    AddonFeaturedQueryParam.query_param in params
                    or AddonRecommendedQueryParam.query_param in params)
                if search_query_param or not is_narrowed:
                    raise serializers.ValidationError(
                        'The "sort" parameter "random" can only be specified '
                        'when the "featured" or "recommended" parameter is '
                        'also present, and the "q" parameter absent.')
                qs = qs.query('function_score',
                              functions=[query.SF('random_score')])

            # Having just recommended sort doesn't make any sense, so fall
            # back to the default order.
            if sort_param == 'recommended':
                sort_param = None

        if sort_param is None:
            # The default sort depends on the presence of a query: relevance
            # with a query, otherwise recommended,downloads (or just
            # downloads when the switch is off).
            recommended_waffle_on = switch_is_active(
                'api-recommendations-priority')
            if search_query_param:
                split_sort_params = ['relevance']
            elif recommended_waffle_on:
                split_sort_params = ['recommended', 'downloads']
            else:
                split_sort_params = ['downloads']

        try:
            order_by = [
                self.SORTING_PARAMS[name] for name in split_sort_params
            ]
        except KeyError:
            raise serializers.ValidationError('Invalid "sort" parameter.')

        return qs.sort(*order_by)
def test_function_score_with_functions():
    """A ``script_score`` function is serialized under the ``functions`` key."""
    script = "doc['comment_count'] * _score"
    q = query.Q(
        "function_score",
        functions=[query.SF("script_score", script=script)],
    )

    expected = {
        "function_score": {
            "functions": [{"script_score": {"script": script}}],
        }
    }
    assert expected == q.to_dict()
Beispiel #13
0
def get_subscribers(targetings, hours_whitelist, volume):
    """Fetch up to ``volume`` randomly ordered subscribers from the "users" index.

    Args:
        targetings: list of condition dicts, each with a "field", an
            "operator" ("IN" or "NOT IN") and a "values" list. The list is
            not modified.
        hours_whitelist: container of hours; only timezones whose current
            local hour is in it are targeted.
        volume: upper bound of the result slice (``[:volume]``).

    Returns:
        A list of subscriber dicts (the hit's fields plus ``_id``), or
        ``None`` when the search raises ``ElasticsearchException`` (the
        error is logged).

    Raises:
        KeyError: if a condition uses an operator other than "IN"/"NOT IN".
    """
    logger.debug("get_subscribers: getting subscribers")
    start_time = time.time()
    timezones = [tz
                 for tz in pytz.all_timezones
                 if datetime.now(pytz.timezone(tz)).hour in hours_whitelist]

    # Work on a copy: the implicit conditions added below must not leak into
    # the caller's list (they would accumulate if it were reused).
    conditions = list(targetings)
    conditions.append({
        "field": "unsub",
        "operator": "NOT IN",
        "values": [1, "true"]
    })
    if timezones:
        conditions.append({
            "field": "timezone",
            "operator": "IN",
            "values": timezones
        })
    es_search = Search(using=es, index="users")
    operator_mappings = {
        'IN': 'must',
        'NOT IN': 'must_not',
    }

    # Combine one bool/terms clause per condition.
    es_query = Q()
    for condition in conditions:
        condition_pair = {condition["field"]: condition["values"]}
        terms_q = Q('terms', **condition_pair)
        bool_operator = operator_mappings[condition['operator']]
        bool_q = Q('bool', **{bool_operator: terms_q})
        es_query += bool_q
    es_search = es_search.query(es_query)
    # Replace the score with a random one (boost_mode="replace").
    es_search.query = dslq.FunctionScore(
        query=es_search.query,
        functions=[dslq.SF('random_score')],
        boost_mode="replace"
        )
    es_search = es_search[:volume]
    try:
        res = es_search.execute()
    except ElasticsearchException as e:
        logger.error(f"get_subscribers: Exception {e}")
        return None

    subscribers = []
    for row in res.hits:
        subscriber = row.to_dict()
        subscriber['_id'] = row.meta.id
        subscribers.append(subscriber)
    end_time = time.time()
    logger.debug(f"get_subscribers: finished in "
                 f"{int((end_time - start_time) * 1000)}ms")
    return subscribers
Beispiel #14
0
def test_function_score_gauss():
    """Score houses matching a city with two multiplied ``gauss`` decays.

    One decay runs over ``price`` and one over ``location``; the request
    body and each hit's city/location are printed.
    """
    price_decay = query.SF('gauss', price={'origin': '0', 'scale': '20'})
    location_decay = query.SF(
        'gauss',
        location={
            'origin': '26.494627, -81.961609',
            'scale': '2km',
            'offset': '0km',
            'decay': 0.33,
        },
    )
    q = query.Q(
        'function_score',
        query=query.Q('match', city='Sarasota'),
        functions=[price_decay, location_decay],
        score_mode="multiply",
    )

    s = House.search().query(q)
    print(s.to_dict())
    for h in s.execute():
        print(h.city, h.location)
Beispiel #15
0
    def search(self, page_id='next_prediction'):
        """Fetch one page document and return it as a single-item list.

        With the default ``page_id`` a page is picked through a
        ``random_score`` query; any other value is matched against ``_id``.
        The returned dict carries the page metadata, the detected objects
        and the base64-encoded bytes of the page image read from
        ``/cosmos_tmp/images``.
        """
        if self.awsauth is None:
            connections.create_connection(hosts=self.hosts)
        else:
            connections.create_connection(
                hosts=self.hosts,
                http_auth=self.awsauth,
                use_ssl=True,
                verify_certs=True,
                connection_class=RequestsHttpConnection)

        if page_id == "next_prediction":
            q = Q('function_score', functions=[query.SF("random_score")])
        else:
            q = Q('match', _id=page_id)

        s = Search(index='page').query(q)[0]
        logger.info("About to execute")
        resp = s.execute().to_dict()
        first_hit = resp['hits']['hits'][0]
        hit = first_hit['_source']

        # One entry per (bbox, class, score) triple stored on the page.
        pp_detected_objs = [
            {
                "bounding_box": bbox,
                "class": cls_name,
                "confidence": score,
                "annotated_class": None,
                "obj_id": -1
            }
            for bbox, cls_name, score in zip(hit['bbox'],
                                             hit['postprocess_cls'],
                                             hit['postprocess_score'])
        ]

        image_dir = '/cosmos_tmp/images'
        image_path = os.path.join(image_dir, os.path.basename(hit['img_pth']))
        with open(image_path, 'rb') as imf:
            imbytes = base64.b64encode(imf.read()).decode('ascii')

        page = {
            "_id": first_hit['_id'],
            "page_height": hit['pdf_dims'][3],
            "page_width": hit['pdf_dims'][2],
            "pdf_id": -1,
            "pdf_name": hit['pdf_name'],
            "page_num": hit["page_num"],
            "pp_detected_objs": pp_detected_objs,
            "resize_bytes": imbytes
        }
        return [page]
Beispiel #16
0
def test_function_score_with_functions():
    """Check the dict form of a function_score with one ``script_score``."""
    script_fn = query.SF('script_score',
                         script="doc['comment_count'] * _score")
    q = query.Q('function_score', functions=[script_fn])

    expected_function = {
        'script_score': {'script': "doc['comment_count'] * _score"},
    }
    expected = {'function_score': {'functions': [expected_function]}}
    assert expected == q.to_dict()
Beispiel #17
0
    def filter_queryset(self, request, qs, view):
        """Sort ``qs`` according to the ``sort`` request parameter.

        ``sort`` is a comma-separated list of keys of ``self.SORTING_PARAMS``.
        Without it, the order is ``relevance`` when ``q`` is present and
        ``downloads`` otherwise.

        Raises ``serializers.ValidationError`` for an unknown sort name, for
        ``random`` combined with other sorts, and for ``random`` used without
        the featured parameter or together with ``q``.
        """
        search_query_param = request.GET.get('q')
        sort_param = request.GET.get('sort')
        order_by = None

        if sort_param is not None:
            requested_sorts = sort_param.split(',')
            try:
                order_by = [
                    self.SORTING_PARAMS[name] for name in requested_sorts
                ]
            except KeyError:
                raise serializers.ValidationError('Invalid "sort" parameter.')

            # Random sort is a bit special.
            # First, it can't be combined with other sorts.
            if len(requested_sorts) > 1 and 'random' in requested_sorts:
                raise serializers.ValidationError(
                    'The "random" "sort" parameter can not be combined.')

            # Second, for perf reasons it's only available when the
            # 'featured' param is present (to limit the number of documents
            # the random score is applied to) and a search query is absent
            # (to prevent clashing with the score functions coming from a
            # search query).
            if sort_param == 'random':
                has_featured = (
                    AddonFeaturedQueryParam.query_param in request.GET)
                if not has_featured or search_query_param:
                    raise serializers.ValidationError(
                        'The "sort" parameter "random" can only be specified '
                        'when the "featured" parameter is also present, and '
                        'the "q" parameter absent.')
                qs = qs.query('function_score',
                              functions=[query.SF('random_score')])

        if order_by:
            return qs.sort(*order_by)

        # The default sort depends on the presence of a query: relevance
        # with a query, otherwise downloads.
        if search_query_param:
            default_sort = 'relevance'
        else:
            default_sort = 'downloads'
        return qs.sort(self.SORTING_PARAMS[default_sort])
Beispiel #18
0
def test_field_value_factor():
    """Boost a multi_match search on posts by the ``votes`` field.

    The request body and the title of every hit are printed.
    """
    text_query = query.Q(
        "multi_match",
        query='python',
        fields=['title', 'content'],
    )
    votes_boost = query.SF(
        'field_value_factor',
        field='votes',
        modifier='log1p',
        factor=0.1,
    )
    q = query.Q(
        'function_score',
        query=text_query,
        functions=[votes_boost],
        score_mode="sum",
        max_boost=1.5,
    )

    s = Post.search().query(q)
    print(s.to_dict())
    for h in s.execute():
        print(h.title)
Beispiel #19
0
    def filter_queryset(self, request, queryset, view):
        """Apply the ``q`` search term to ``queryset`` and return it.

        With a search term, one boosted clause is built per entry of
        ``self.search_operations`` and the clauses are combined in a
        ``function_score`` query using the ``boost`` field. Superusers
        additionally get ``explain=True`` on the request.
        """
        search_term = view.query_params.get('q')

        if search_term:
            should_clauses = [
                Q(query_type,
                  **{field: {'query': search_term, 'boost': boost}})
                for query_type, field, boost in self.search_operations
            ]
            queryset = queryset.query(
                'function_score',
                query=query.Bool(should=should_clauses),
                functions=[query.SF('field_value_factor', field='boost')],
            )

        if not request.user.is_superuser:
            return queryset
        return queryset.extra(explain=True)
Beispiel #20
0
def test_function_score_with_no_function_is_boost_factor():
    """A dict-built ``SF`` serializes to its weight plus the filter's dict."""
    term_filter = query.Q('term', f=42)
    weighted = query.SF({'weight': 20, 'filter': term_filter})
    q = query.Q('function_score', functions=[weighted])

    expected_function = {'filter': {'term': {'f': 42}}, 'weight': 20}
    expected = {'function_score': {'functions': [expected_function]}}
    assert expected == q.to_dict()
Beispiel #21
0
    def filter_queryset(self, request, queryset, view):
        """Return ``queryset`` narrowed by the ``q`` parameter of the view.

        Each ``(query_type, field, boost)`` triple in
        ``self.search_operations`` becomes one should clause for the search
        term; the clauses are scored with the ``boost`` field through a
        ``function_score`` query. ``explain=True`` is added for superusers.
        """
        search_term = view.query_params.get("q")

        if search_term:
            clauses = []
            for query_type, field, boost in self.search_operations:
                clause_body = {"query": search_term, "boost": boost}
                clauses.append(Q(query_type, **{field: clause_body}))
            boost_function = query.SF("field_value_factor", field="boost")
            queryset = queryset.query(
                "function_score",
                query=query.Bool(should=clauses),
                functions=[boost_function],
            )

        wants_explanation = request.user.is_superuser
        return queryset.extra(explain=True) if wants_explanation else queryset
Beispiel #22
0
    def filter_queryset(self, request, queryset, view):
        """Apply the search parameter named ``self.search_param`` to ``queryset``.

        With a value present, one boosted should clause is built per entry
        of ``self.search_operations`` and wrapped in a ``function_score``
        query on the ``boost`` field. ``explain=True`` is added when the
        ``search_explanation`` flag is active for the request.
        """
        search_param = request.QUERY_PARAMS.get(self.search_param)

        if search_param:
            should_clauses = [
                Q(query_type,
                  **{field: {'query': search_param, 'boost': boost}})
                for query_type, field, boost in self.search_operations
            ]
            boost_function = query.SF('field_value_factor', field='boost')
            queryset = queryset.query(
                'function_score',
                query=query.Bool(should=should_clauses),
                functions=[boost_function],
            )

        if not flag_is_active(request, 'search_explanation'):
            return queryset
        return queryset.extra(explain=True)
Beispiel #23
0
    def apply_search_query(self, search_query, qs, sort=None):
        """Score ``qs`` for ``search_query`` using the ``boost`` field.

        The primary and secondary should clauses are wrapped in a
        ``function_score`` query. A rescore section is added when ``sort``
        is ``None`` or ``'relevance'``.

        Returns the resulting search object.
        """
        analyzer = get_locale_analyzer(translation.get_language())

        # The query consists of a number of should clauses; the ones with the
        # higher boost are called "primary" for convenience.
        primary_should = self.primary_should_rules(search_query, analyzer)
        secondary_should = self.secondary_should_rules(search_query, analyzer)

        # Scoring is altered by the "boost" field defined in the mapping
        # (used to boost public addons higher than the rest).
        boost_functions = [query.SF('field_value_factor', field='boost')]

        # Assemble everything together.
        text_query = query.Bool(should=primary_should + secondary_should)
        scored = qs.query(
            'function_score', query=text_query, functions=boost_functions)

        if sort is not None and sort != 'relevance':
            return scored

        # Searching by relevancy: rescore the top 10 (window_size below)
        # results per shard with more expensive rules using
        # match_phrase + slop.
        rescore_rules = self.rescore_rules(search_query, analyzer)
        rescore_body = {
            'window_size': 10,
            'query': {
                'rescore_query': query.Bool(should=rescore_rules).to_dict(),
            },
        }
        return scored.extra(rescore=rescore_body)
    def filter_queryset(self, request, qs, view):
        """Apply the requested (or default) sort order to the search ``qs``.

        The sort names come from ``self.get_sort_params(request)`` and are
        mapped through ``self.SORTING_PARAMS``. Some names are dropped first:
        ``relevance`` when there is no ``q``, a lone ``recommended``, and
        ``recommended`` when ``relevance`` is also requested. If nothing is
        left, the default is ``relevance`` with a ``q`` and
        ``recommended,users`` without.

        Raises:
            serializers.ValidationError: for an unknown sort name, for
                ``random`` combined with other sorts, and for ``random``
                used without the featured/promoted parameter or with ``q``.
        """
        search_query_param = request.GET.get('q')
        split_sort_params = self.get_sort_params(request)

        if split_sort_params:
            # Random sort is a bit special.
            # First, it can't be combined with other sorts.
            if 'random' in split_sort_params and len(split_sort_params) > 1:
                raise serializers.ValidationError(
                    'The "random" "sort" parameter can not be combined.'
                )

            # Second, for perf reasons it's only available when the 'featured'
            # or 'promoted' param is present (to limit the number of
            # documents we'll have to apply the random score to) and a search
            # query is absent (to prevent clashing with the score functions
            # coming from a search query).
            if split_sort_params == ['random']:

                is_random_sort_available = (
                    AddonFeaturedQueryParam.query_param in request.GET
                    or AddonPromotedQueryParam.query_param in request.GET
                ) and not search_query_param
                if is_random_sort_available:
                    # We want randomness to change only once every 24 hours, so
                    # we use a seed that depends on the date.
                    qs = qs.query(
                        'function_score',
                        functions=[
                            query.SF(
                                'random_score',
                                seed=date.today().toordinal(),
                            )
                        ],
                    )
                else:
                    raise serializers.ValidationError(
                        'The "sort" parameter "random" can only be specified '
                        'when the "featured" or "promoted" parameter is '
                        'also present, and the "q" parameter absent.'
                    )

            # Sorting by relevance only makes sense with a query string.
            # Compare each param to the name: a bare `not 'relevance'` is
            # always False and would discard every requested sort.
            if not search_query_param and 'relevance' in split_sort_params:
                split_sort_params = [
                    param for param in split_sort_params
                    if param != 'relevance'
                ]

            # Having just recommended sort doesn't make any sense, so ignore it
            if split_sort_params == ['recommended']:
                split_sort_params = None
            # relevance already takes into account recommended so ignore it too
            elif (
                'recommended' in split_sort_params
                and 'relevance' in split_sort_params
            ):
                split_sort_params = [
                    param for param in split_sort_params
                    if param != 'recommended'
                ]

        if not split_sort_params:
            # The default sort depends on the presence of a query: we sort by
            # relevance if we have a query, otherwise by recommended,users.
            split_sort_params = (
                ['relevance'] if search_query_param else ['recommended', 'users']
            )

        try:
            order_by = [self.SORTING_PARAMS[name] for name in split_sort_params]
        except KeyError:
            raise serializers.ValidationError('Invalid "sort" parameter.')

        return qs.sort(*order_by)
Beispiel #25
0
    def get_app_filter(cls, request, additional_data=None, sq=None,
                       app_ids=None, no_filter=False):
        """
        THE grand, consolidated ES filter for Webapps. By default:
        - Excludes non-public apps.
        - Excludes disabled apps (whether by reviewer or by developer).
        - Excludes based on region exclusions.
        - TODO: Excludes based on device and platform support.

        additional_data -- an object with more data to allow more filtering.
        sq -- if you have an existing search object to filter off of.
        app_ids -- if you want to filter by a list of app IDs.
        no_filter -- doesn't apply the consumer-side excludes (public/region).

        Returns the search object with the query and filters applied.
        """
        # Imported here rather than at module level; presumably to avoid a
        # circular import -- TODO confirm.
        from mkt.api.base import get_region_from_request
        from mkt.search.views import name_query

        # Fall back to a fresh search / empty containers for omitted args.
        sq = sq or cls.search()
        additional_data = additional_data or {}
        app_ids = app_ids or []

        # Defaults for every filterable key; request-derived values are read
        # with getattr so a request lacking the attribute yields the default.
        data = {
            'app_type': [],
            'author.raw': None,
            'category': None,  # Slug.
            'device': None,  # ID.
            'gaia': getattr(request, 'GAIA', False),
            'is_offline': None,
            'manifest_url': '',
            'mobile': getattr(request, 'MOBILE', False),
            'premium_type': [],
            'profile': get_feature_profile(request),
            'q': '',
            'region': getattr(get_region_from_request(request), 'id', None),
            'status': None,
            'supported_locales': [],
            'tablet': getattr(request, 'TABLET', False),
            'tags': '',
        }
        # Caller-supplied values override the defaults above.
        data.update(additional_data)

        # Fields that will be filtered with a term query.
        term_fields = ('author.raw', 'device', 'manifest_url', 'status',
                       'tags')
        # Fields that will be filtered with a terms query.
        terms_fields = ('category', 'premium_type', 'app_type',
                        'supported_locales')

        # QUERY.
        if data['q']:
            # Function score for popularity boosting (defaults to multiply).
            sq = sq.query(
                'function_score',
                query=name_query(data['q'].lower()),
                functions=[query.SF('field_value_factor', field='boost')])

        # MUST.
        # The public/enabled requirements are skipped entirely with no_filter.
        must = [
            F('term', status=amo.STATUS_PUBLIC),
            F('term', is_disabled=False),
        ] if not no_filter else []

        for field in term_fields + terms_fields:
            # Term filters.
            # Only keys holding a truthy value produce a filter.
            if data[field]:
                filter_type = 'term' if field in term_fields else 'terms'
                must.append(F(filter_type, **{field: data[field]}))

        if not no_filter:
            if data['profile']:
                # Feature filters.
                profile = data['profile']
                for k, v in profile.to_kwargs(prefix='features.has_').items():
                    must.append(F('term', **{k: v}))
            if data['mobile'] or data['gaia']:
                # Uses flash.
                must.append(F('term', uses_flash=False))
            # `is not None` so an explicit False is still filtered on.
            if data['is_offline'] is not None:
                must.append(F('term', is_offline=data['is_offline']))

        # SHOULD.
        should = []
        if app_ids:
            should = [es_filter.Terms(id=list(set(app_ids)))]
            # Size the result window to the number of distinct requested IDs.
            sq = sq[0:len(set(app_ids))]

        # FILTER.
        if must or should:
            sq = sq.filter(es_filter.Bool(must=must, should=should))

        if data['region'] and not no_filter:
            # Region exclusions.
            sq = sq.filter(~F('term', region_exclusions=data['region']))

        return sq
Beispiel #26
0
    def filter_queryset(self, request, queryset, view):
        """Apply the full-text scoring query for the ``q`` GET parameter.

        The search text is read from ``request.GET['q']`` and lower-cased.
        When it is empty, ``queryset`` is returned unchanged. Otherwise a
        ``function_score`` query is added to ``queryset``; it wraps a bool
        query made only of ``should`` clauses and uses the ``boost`` field
        (plus an optional preferred-region function) as scoring functions.
        """
        text = request.GET.get('q', '').lower()
        locale = translation.get_language()
        analyzer = self._get_locale_analyzer(locale)

        if not text:
            return queryset

        # Fuzzy matching is only attempted for single-word searches; it
        # doesn't make sense for multi-word ones.
        single_word = ' ' not in text

        # (query class, per-field options) pairs, applied to each base field.
        rules = [
            (query.Match, {
                'query': text,
                'boost': 3,
                'analyzer': 'standard'
            }),
            (query.Match, {
                'query': text,
                'boost': 4,
                'type': 'phrase',
                'slop': 1
            }),
            (query.Prefix, {
                'value': text,
                'boost': 1.5
            }),
        ]
        if single_word:
            rules.append((query.Fuzzy, {
                'value': text,
                'boost': 2,
                'prefix_length': 1
            }))

        # Some of these fields might not be present in every document type
        # or index.
        base_fields = ('app_slug', 'author', 'name', 'short_name', 'slug',
                       'title', 'url_tokenized')
        clauses = [query_cls(**{field: options})
                   for query_cls, options in rules
                   for field in base_fields]

        # Exact matches have to target non-analyzed fields: the raw item
        # name, the GUID and, for numeric input, the ID. They get a large
        # boost since an exact hit is likely what the user wants.
        # FIXME: we should also do that on translations and slug/app_slug,
        # but we don't store a raw version for them at the moment.
        exact_fields = ['name.raw', 'guid']
        if text.isnumeric():
            exact_fields.append('id')
        for field in exact_fields:
            clauses.append(
                query.Term(**{field: {'value': text, 'boost': 10}}))

        # Localized name fields, when an analyzer exists for the locale.
        if analyzer:
            for stem in ('name', 'short_name'):
                localized_field = '%s_l10n_%s' % (stem, analyzer)
                clauses.append(
                    query.Match(**{
                        localized_field: {
                            'query': text,
                            'boost': 2.5
                        }
                    }))

        # Description, searched as a phrase with a lower boost.
        clauses.append(
            query.Match(description={
                'query': text,
                'boost': 0.8,
                'type': 'phrase'
            }))

        if analyzer:
            # Locales listed in the stemmer map use a dedicated
            # "<locale>_analyzer"; the others use the analyzer name as is.
            if analyzer in mkt.STEMMER_MAP:
                desc_analyzer = '%s_analyzer' % analyzer
            else:
                desc_analyzer = analyzer
            clauses.append(
                query.Match(**{
                    'description_l10n_%s' % analyzer: {
                        'query': text,
                        'boost': 0.6,
                        'type': 'phrase',
                        'analyzer': desc_analyzer
                    }
                }))

        # Tags: exact term, plus fuzzy for single-word searches.
        clauses.append(query.Term(tags={'value': text}))
        if single_word:
            clauses.append(
                query.Fuzzy(tags={'value': text, 'prefix_length': 1}))

        # Scoring functions handed to the `function_score` query.
        score_functions = [query.SF('field_value_factor', field='boost')]

        # Extra boost for documents preferring the request's region, if any.
        region = get_region_from_request(request)
        if region:
            score_functions.append({
                'filter': {
                    'term': {
                        'preferred_regions': region.id
                    }
                },
                # TODO: When we upgrade to Elasticsearch 1.4, change this
                # to 'weight'.
                'boost_factor': 4,
            })

        return queryset.query('function_score',
                              query=query.Bool(should=clauses),
                              functions=score_functions)
Beispiel #27
0
def _find(params,
          total_only=False,
          make_suggestions=False,
          min_suggestion_score=0.8):
    """Execute a document search and serialize the response.

    Args:
        params: Mapping providing the keys read here: ``query`` (search
            text), ``locales``, ``archive`` (``"exclude"`` and ``"only"``
            add a filter on ``archived``; other values add none),
            ``slug_prefixes``, ``sort`` (``"relevance"``, ``"popularity"``
            or anything else for the popularity-weighted default), ``size``
            and ``page`` (1-based, used to compute the result slice).
        total_only: When true, return ``response.hits.total`` as soon as
            the search has executed.
        make_suggestions: When true, attach term suggesters for ``title``
            and ``body`` and report the first qualifying suggestion whose
            own search yields at least one hit.
        min_suggestion_score: A suggestion is only considered when its
            score is strictly greater than this value.

    Returns:
        ``response.hits.total`` when ``total_only`` is true; otherwise a
        dict with the keys ``documents``, ``metadata`` and ``suggestions``.
    """
    text = params["query"]

    search_query = Search(index=settings.SEARCH_INDEX_NAME, )
    if make_suggestions:
        # XXX research if it's better to use phrase suggesters and if
        # that works
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#phrase-suggester
        search_query = search_query.suggest("title_suggestions",
                                            text,
                                            term={"field": "title"})
        search_query = search_query.suggest("body_suggestions",
                                            text,
                                            term={"field": "body"})

    # Full-text clauses: title matches are boosted over body matches.
    sub_queries = [
        Q("match", title={
            "query": text,
            "boost": 2.0
        }),
        Q("match", body={
            "query": text,
            "boost": 1.0
        }),
    ]
    if " " in text:
        # Multi-word searches additionally get highly boosted phrase matches.
        sub_queries.append(
            Q("match_phrase", title={
                "query": text,
                "boost": 10.0
            }))
        sub_queries.append(
            Q("match_phrase", body={
                "query": text,
                "boost": 5.0
            }))

    sub_query = query.Bool(should=sub_queries)

    if params["locales"]:
        search_query = search_query.filter("terms", locale=params["locales"])
    if params["archive"] == "exclude":
        search_query = search_query.filter("term", archived=False)
    elif params["archive"] == "only":
        search_query = search_query.filter("term", archived=True)

    if params["slug_prefixes"]:
        # Kept under its own name so it is not confused with the full-text
        # clauses that were already folded into `sub_query` above.
        prefix_queries = [
            Q("prefix", slug=prefix) for prefix in params["slug_prefixes"]
        ]
        search_query = search_query.query(query.Bool(should=prefix_queries))

    search_query = search_query.highlight_options(
        pre_tags=["<mark>"],
        post_tags=["</mark>"],
        number_of_fragments=3,
        fragment_size=120,
        encoder="html",
    )
    search_query = search_query.highlight("title", "body")

    if params["sort"] == "relevance":
        search_query = search_query.sort("_score", "-popularity")
        search_query = search_query.query(sub_query)
    elif params["sort"] == "popularity":
        search_query = search_query.sort("-popularity", "_score")
        search_query = search_query.query(sub_query)
    else:
        # Default ordering: fold the popularity field into the score
        # through a `function_score` query instead of an explicit sort.
        popularity_factor = 10.0
        boost_mode = "sum"
        score_mode = "max"
        search_query = search_query.query(
            "function_score",
            query=sub_query,
            functions=[
                query.SF(
                    "field_value_factor",
                    field="popularity",
                    factor=popularity_factor,
                    missing=0.0,
                )
            ],
            boost_mode=boost_mode,
            score_mode=score_mode,
        )

    # The body is excluded from the returned source; only its highlights
    # are read from the hits below.
    search_query = search_query.source(excludes=["body"])

    # `page` is 1-based: page 1 covers [0, size).
    search_query = search_query[params["size"] *
                                (params["page"] - 1):params["size"] *
                                params["page"]]

    retry_options = {
        "retry_exceptions": (
            # This is the standard operational exception.
            exceptions.ConnectionError,
            # This can happen if the search happened right as the index had
            # just been deleted due to a fresh re-indexing happening in Yari.
            exceptions.NotFoundError,
            # This can happen when the index simply isn't ready yet.
            exceptions.TransportError,
        ),
        # The default in redo is 60 seconds. Let's tone that down.
        "sleeptime":
        settings.ES_RETRY_SLEEPTIME,
        "attempts":
        settings.ES_RETRY_ATTEMPTS,
        "jitter":
        settings.ES_RETRY_JITTER,
    }
    with retrying(search_query.execute, **retry_options) as retrying_function:
        response = retrying_function()

    if total_only:
        return response.hits.total

    metadata = {
        "took_ms": response.took,
        "total": {
            # The `response.hits.total` is a `elasticsearch_dsl.utils.AttrDict`
            # instance. Pluck only the exact data needed.
            "value": response.hits.total.value,
            "relation": response.hits.total.relation,
        },
        "size": params["size"],
        "page": params["page"],
    }
    documents = []
    for hit in response:
        # A hit without highlights for a field raises AttributeError on
        # access; fall back to an empty list in that case.
        try:
            body_highlight = list(hit.meta.highlight.body)
        except AttributeError:
            body_highlight = []
        try:
            title_highlight = list(hit.meta.highlight.title)
        except AttributeError:
            title_highlight = []

        d = {
            "mdn_url": hit.meta.id,
            "score": hit.meta.score,
            "title": hit.title,
            "locale": hit.locale,
            "slug": hit.slug,
            "popularity": hit.popularity,
            "archived": hit.archived,
            "summary": hit.summary,
            "highlight": {
                "body": body_highlight,
                "title": title_highlight,
            },
        }
        documents.append(d)

    suggestions = []
    # The response only carries `suggest` when suggesters were requested.
    suggest = getattr(response, "suggest", None)
    if suggest:
        suggestion_strings = _unpack_suggestions(
            text,
            suggest,
            ("body_suggestions", "title_suggestions"),
        )

        for score, string in suggestion_strings:
            # Honour the threshold. The previous condition ended in
            # `or 1`, which made it always true and silently disabled
            # `min_suggestion_score`.
            if score <= min_suggestion_score:
                continue
            # Sure, this is different way to spell, but what will it yield
            # if you actually search it?
            total = _find(dict(params, query=string), total_only=True)
            if total.value > 0:
                suggestions.append({
                    "text": string,
                    "total": {
                        # This 'total' is an `AttrDict` instance.
                        "value": total.value,
                        "relation": total.relation,
                    },
                })
                # Since they're sorted by score, it's usually never useful
                # to suggest more than exactly 1 good suggestion.
                break

    return {
        "documents": documents,
        "metadata": metadata,
        "suggestions": suggestions,
    }