def clusters():
    """
    Render the AJAX version of the clustering page.

    Intended to replace display_clusters() once it has been tested.
    """
    person = request.values.get('filter')
    print(person)
    Face._index.refresh()
    total = Face.search().count()
    named = Face.search().filter("exists", field="person").count()
    status = "{:.1%} ({} out of {}) faces are named. Clusters count: {}".format(
        named / total, named, total, Cluster.search().count())

    a = A("terms", field="person.raw", size=10000)
    ps = Search()
    ps.aggs.bucket("persons", a)
    psr = ps.execute()
    persons = [b.key for b in psr.aggs['persons']]

    if person:
        s = Cluster.search().filter("prefix", person=person).sort("-face_count")
        results = s[0:10000].execute()
    else:
        s = Cluster.search().exclude("exists", field="person")
        s.query = FunctionScore(
            query=s.query,
            functions=[SF('random_score', weight=100),
                       SF('field_value_factor', field="face_count", weight=1)],
            score_mode="avg", boost_mode="replace")
        results = s[0:50].execute()

    return render_template('clusters.html', clusters=results,
                           persons=persons, status=status)
def query(self, search, query):
    if not query:
        return search
    # match the query in tags, title and body
    q = Q('multi_match', fields=['tags^10', 'title', 'body'], query=query)
    # also find questions that have answers matching the query
    q |= Q('has_child', type='answer', query=Q('match', body=query),
           inner_hits={
               'highlight': {
                   "pre_tags": ["[[["],
                   "post_tags": ["]]]"],
                   'fields': {'body': {'fragment_size': 30}}
               },
               '_source': False,
               'size': 1
           })
    # take the rating field into account when sorting
    search = search.query(
        'function_score',
        query=q,
        functions=[SF('field_value_factor', field='rating')])
    return search
def _compute_decay_functions():
    decay_function = SF('gauss',
                        publication_datetime=dict(origin='now', scale='30d',
                                                  offset='7d', decay='0.1'))
    return [decay_function]
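# Hedged usage sketch (not from the original source): the decay functions
# returned above would typically be combined with a base query via
# ``function_score`` so that older publications score lower. The ``search``
# and ``base_query`` names here are assumptions for illustration.
def _apply_recency_decay(search, base_query):
    return search.query(
        'function_score',
        query=base_query,
        functions=_compute_decay_functions(),
        boost_mode='multiply')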
def query(self, search, query):
    if query:
        return search.query("simple_query_string", fields=self.fields,
                            query=query, default_operator='and')
    else:
        search.query = FunctionScore(
            query=Q(),
            functions=[SF('random_score', seed=int(time.time()))])
    return search
def query(self, search, query):
    if not query:
        return search
    # match the query against title and abstract
    q = Q('multi_match', fields=['title', 'abstract'], query=query)
    # take the title field into account when sorting
    search = search.query(
        'function_score',
        query=q,
        functions=[SF('field_value_factor', field='title')])
    return search
def random(request):
    """ Redirect to a random case over 1,000 words. """
    s = CaseDocument.search().source(['frontend_url']).filter(
        'range', analysis__word_count={'gte': 1000})
    s.query = FunctionScore(
        query=s.query,  # omit this if not applying a filter first
        functions=[
            SF('random_score'),
            # to weight by pagerank:
            # SF('field_value_factor', field='analysis.pagerank.percentile',
            #    modifier="ln1p", missing=0)
        ],
        boost_mode='replace',
    )
    random_case = s[0].execute()[0]
    return HttpResponseRedirect(random_case.frontend_url)
def api_search(search: Search, query: APIQuery) -> Search:
    """
    Prepare a :class:`.Search` from an :class:`.APIQuery`.

    Parameters
    ----------
    search : :class:`.Search`
        An Elasticsearch search in preparation.
    query : :class:`.APIQuery`
        A query originating from the API.

    Returns
    -------
    :class:`.Search`
        The passed ES search object, updated with specific query parameters
        that implement the advanced query.

    """
    # Classification and date are treated as filters; this foreshadows the
    # behavior of faceted search.
    if not query.include_older_versions:
        search = search.filter("term", is_current=True)
    _q_clsn = Q()
    if query.primary_classification:
        _q_clsn &= reduce(ior, map(query_primary_exact,
                                   list(query.primary_classification)))
    if query.secondary_classification:
        for classification in query.secondary_classification:
            _q_clsn &= reduce(ior, map(query_secondary_exact,
                                       list(classification)))
    q = _fielded_terms_to_q(query) & _date_range(query) & _q_clsn
    if query.order is None or query.order == "relevance":
        # Boost the current version heavily when sorting by relevance.
        q = Q(
            "function_score",
            query=q,
            boost=5,
            boost_mode="multiply",
            score_mode="max",
            functions=[
                SF({
                    "weight": 5,
                    "filter": Q("term", is_current=True)
                })
            ],
        )
    search = sort(query, search)
    search = search.query(q)
    return search
def advanced_search(search: Search, query: AdvancedQuery) -> Search:
    """
    Prepare a :class:`.Search` from an :class:`.AdvancedQuery`.

    Parameters
    ----------
    search : :class:`.Search`
        An Elasticsearch search in preparation.
    query : :class:`.AdvancedQuery`
        A query originating from the advanced search UI.

    Returns
    -------
    :class:`.Search`
        The passed ES search object, updated with specific query parameters
        that implement the advanced query.

    """
    # Classification and date are treated as filters; this foreshadows the
    # behavior of faceted search.
    if not query.include_older_versions:
        search = search.filter("term", is_current=True)
    _q_clsn = limit_by_classification(query.classification)
    if query.include_cross_list:
        _q_clsn |= limit_by_classification(query.classification,
                                           "secondary_classification")
    q = (_fielded_terms_to_q(query) & _date_range(query) & _q_clsn)
    if query.order is None or query.order == 'relevance':
        # Boost the current version heavily when sorting by relevance.
        q = Q('function_score', query=q, boost=5, boost_mode="multiply",
              score_mode="max",
              functions=[
                  SF({
                      'weight': 5,
                      'filter': Q('term', is_current=True)
                  })
              ])
    search = sort(query, search)
    search = search.query(q)
    return search
def get_featured_websites(self):
    """
    Get up to 11 featured MOWs for the request's region.

    If fewer than 11 are available, make up the difference with
    globally-featured MOWs.
    """
    REGION_TAG = 'featured-website-%s' % self.request.REGION.slug
    region_filter = es_filter.Term(tags=REGION_TAG)

    GLOBAL_TAG = 'featured-website'
    global_filter = es_filter.Term(tags=GLOBAL_TAG)

    mow_query = query.Q(
        'function_score',
        filter=es_filter.Bool(should=[region_filter, global_filter]),
        functions=[
            SF('random_score', seed=self._get_daily_seed()),
            es_function.BoostFactor(value=100.0, filter=region_filter)
        ],
    )

    es = Search(using=WebsiteIndexer.get_es())[:11]
    results = es.query(mow_query).execute().hits
    return ESWebsiteSerializer(results, many=True).data
def filter_queryset(self, request, queryset, view):
    daily_seed = int(datetime.datetime.now().strftime('%Y%m%d'))

    # Map over the game categories to create a function score query for each
    # one and dump them into a Bool should.
    game_query = query.Q(
        'function_score',
        filter=es_filter.Bool(
            should=[es_filter.Term(tags=cat) for cat in GAME_CATEGORIES]),
        # Consistently random based on the day.
        functions=[SF('random_score', seed=daily_seed)],
    )

    # Bucket by tag. Run a size=1 TopHits aggregation to select only one
    # game from each tag. Results will have to be pulled out of
    # S.execute().aggregations rather than S.execute().hits.
    top_hits = aggs.TopHits(size=1)
    a = aggs.A('terms', field='tags', aggs={'first_game': top_hits})

    queryset = queryset.query(game_query)[0:4]
    queryset.aggs.bucket('top_hits', a)  # Not chainable.
    return queryset
def build_q1(self, keyword):
    if not keyword:
        return Q()
    sq1_1 = Q('match_phrase', fullname=keyword)
    sq1_2 = Q('match_phrase', enname=keyword)
    sq1_3 = Q('match_phrase', area=keyword)
    sq1_4 = Q('match_phrase', industry=keyword)
    ssq1 = Q('match_phrase', symbol=keyword)
    ssq2 = Q('match_phrase', display=keyword)
    q = Q('function_score',
          query=Q(),
          boost=1,
          score_mode='max',
          min_score=2,
          functions=[
              SF({'weight': 5, 'filter': ssq1}),
              SF({'weight': 5, 'filter': ssq2}),
              SF({'weight': 1, 'filter': sq1_1}),
              SF({'weight': 1, 'filter': sq1_2}),
              SF({'weight': 1, 'filter': sq1_3}),
              SF({'weight': 3, 'filter': sq1_4}),
          ])
    return q
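# Hedged usage sketch (not from the original class): ``build_q1`` produces a
# ``function_score`` query whose weighted filters rank exact symbol/display
# matches above full-name, English-name, area and industry matches, while
# ``min_score=2`` drops documents that only hit a weight-1 filter. The
# ``company`` index name below is an assumption for illustration.
def search_by_keyword(self, keyword, size=20):
    s = Search(index='company').query(self.build_q1(keyword))
    return s[:size].execute()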
def _query_all_fields(term: str) -> Q:
    """
    Construct a query against all fields.

    The heart of the query is a `query_string` search against a "combined"
    field, which contains tokens from all of the searchable metadata fields
    on each paper. All tokens in the query must match in that combined field.

    The reason that we do it this way, instead of combining queries across
    multiple fields, is that:

    - To query in a term-centric way across fields (e.g. the `cross_fields`
      query type for `query_string` or `multi_match` searches), all of those
      fields must have the same analyzer. It's a drag to constrain analyzer
      choice on individual fields, so this way we can do what we want with
      individual fields but also support a consistent all-fields search that
      behaves the way that users expect.
    - Performing a disjunct search across all fields can't guarantee that all
      terms match (if we use the disjunct operator within each field), and
      can't handle queries that span fields (if we use the conjunct operator
      within each field).

    In addition to the combined query, we also perform disjunct queries
    across individual fields to generate field-specific hits, and to provide
    control over scoring.

    Weights are applied using :class:`.SF` (score functions). In the current
    implementation, fields are given monotonically decreasing weights in the
    order applied below. More complex score functions may be introduced, and
    that should happen here.

    Parameters
    ----------
    term : str
        A query string.

    Returns
    -------
    :class:`.Q`
        A search-ready query part, including score functions.

    """
    # We only perform TeX queries on title and abstract.
    if is_tex_query(term):
        return _tex_query('title', term) | _tex_query('abstract', term)

    date_partial: Optional[str] = None
    remainder: Optional[str] = None
    try:
        date_partial, remainder = match_date_partial(term)
        logger.debug(f'found date partial: {date_partial}')
    except ValueError:
        pass
    logger.debug(f'partial: {date_partial}; rem: {remainder}')

    match_all_fields = _query_combined(term)
    if date_partial:
        _q = Q("term", announced_date_first=date_partial)
        if remainder:
            _q &= _query_combined(remainder)
        match_all_fields |= _q

    # We include matches of any term in any field, so that we can highlight
    # and score appropriately.
    queries = [
        _query_paper_id(term, operator='or'),
        author_query(term, operator='OR'),
        _query_title(term, default_operator='or'),
        _query_abstract(term, default_operator='or'),
        _query_comments(term, default_operator='or'),
        orcid_query(term, operator='or'),
        author_id_query(term, operator='or'),
        _query_doi(term, operator='or'),
        _query_journal_ref(term, operator='or'),
        _query_report_num(term, operator='or'),
        _query_acm_class(term, operator='or'),
        _query_msc_class(term, operator='or'),
        _query_primary(term, operator='or')
    ]
    if date_partial:
        queries.insert(0, Q("term", announced_date_first=date_partial))

    # If the whole query matches on a specific field, we should consider that
    # responsive even if the query on the combined field does not respond.
    conj_queries = [
        _query_paper_id(term, operator='AND'),
        author_query(term, operator='AND'),
        _query_title(term, default_operator='and'),
        _query_abstract(term, default_operator='and'),
        _query_comments(term, default_operator='and'),
        orcid_query(term, operator='and'),
        author_id_query(term, operator='and'),
        _query_doi(term, operator='and'),
        _query_journal_ref(term, operator='and'),
        _query_report_num(term, operator='and'),
        _query_acm_class(term, operator='and'),
        _query_msc_class(term, operator='and'),
        _query_primary(term, operator='and')
    ]

    query = (match_all_fields | reduce(ior, conj_queries))
    query &= Q("bool", should=queries)  # Partial matches across fields.

    scores = [SF({'weight': i + 1, 'filter': q})
              for i, q in enumerate(queries[::-1])]
    return Q('function_score',
             query=query,
             score_mode="sum",
             functions=scores,
             boost_mode='multiply')
def _query_all_fields(term: str) -> Q:
    """
    Construct a query against all fields.

    The heart of the query is a `query_string` search against a "combined"
    field, which contains tokens from all of the searchable metadata fields
    on each paper. All tokens in the query must match in that combined field.

    The reason that we do it this way, instead of combining queries across
    multiple fields, is that:

    - To query in a term-centric way across fields (e.g. the `cross_fields`
      query type for `query_string` or `multi_match` searches), all of those
      fields must have the same analyzer. It's a drag to constrain analyzer
      choice on individual fields, so this way we can do what we want with
      individual fields but also support a consistent all-fields search that
      behaves the way that users expect.
    - Performing a disjunct search across all fields can't guarantee that all
      terms match (if we use the disjunct operator within each field), and
      can't handle queries that span fields (if we use the conjunct operator
      within each field).

    In addition to the combined query, we also perform disjunct queries
    across individual fields to generate field-specific hits, and to provide
    control over scoring.

    Weights are applied using :class:`.SF` (score functions). In the current
    implementation, fields are given monotonically decreasing weights in the
    order applied below. More complex score functions may be introduced, and
    that should happen here.

    Parameters
    ----------
    term : str
        A query string.

    Returns
    -------
    :class:`.Q`
        A search-ready query part, including score functions.

    """
    # We only perform TeX queries on title and abstract.
    if is_tex_query(term):
        return _tex_query('title', term) | _tex_query('abstract', term)

    match_all_fields = _query_combined(term)

    # We include matches of any term in any field, so that we can highlight
    # and score appropriately.
    queries = [
        _query_paper_id(term, operator='or'),
        author_query(term, operator='or'),
        _query_title(term, default_operator='or'),
        _query_abstract(term, default_operator='or'),
        _query_comments(term, default_operator='or'),
        orcid_query(term, operator='or'),
        author_id_query(term, operator='or'),
        _query_doi(term, operator='or'),
        _query_journal_ref(term, operator='or'),
        _query_report_num(term, operator='or'),
        _query_acm_class(term, operator='or'),
        _query_msc_class(term, operator='or'),
        _query_primary(term, operator='or'),
        _query_secondary(term, operator='or'),
    ]

    # If the whole query matches on a specific field, we should consider that
    # responsive even if the query on the combined field does not respond.
    match_individual_field = reduce(ior, [
        _query_paper_id(term, operator='AND'),
        author_query(term, operator='AND'),
        _query_title(term, default_operator='and'),
        _query_abstract(term, default_operator='and'),
        _query_comments(term, default_operator='and'),
        orcid_query(term, operator='and'),
        author_id_query(term, operator='and'),
        _query_doi(term, operator='and'),
        _query_journal_ref(term, operator='and'),
        _query_report_num(term, operator='and'),
        _query_acm_class(term, operator='and'),
        _query_msc_class(term, operator='and'),
        _query_primary(term, operator='and'),
        _query_secondary(term, operator='and')
    ])

    # It is possible that the query includes a date-related term, which we
    # interpret as an announcement date of v1 of the paper. We currently
    # support both "standard" ``yyyy`` or ``yyyy-MM`` formats as well as a
    # legacy format ``yyMM``.
    #
    # The general strategy here is to first attempt to match a date fragment
    # using one of the formats above, and split the query so that we can
    # handle the date fragment and the remainder of the query separately. If
    # we find something that looks like a date fragment, we perform the
    # all-fields search on the remainder and use the fragment to build
    # queries against the announcement date of the original paper version.
    date_fragment: Optional[str] = None
    remainder: Optional[str] = None
    try:
        date_fragment, remainder = parse_date(term)
    except ValueError:
        pass

    if date_fragment:
        logger.debug('date: %s; remainder: %s', date_fragment, remainder)
        match_date: Optional[Q] = None
        match_date_partial: Optional[Q] = None
        match_date_announced: Optional[Q] = None
        match_dates: List[Q] = []

        logger.debug('date_fragment: %s', date_fragment)

        # Try to query using the legacy yyMM date partial format.
        date_partial = parse_date_partial(date_fragment)
        logger.debug('date_partial: %s', date_partial)
        if date_partial is not None:
            match_date_partial = Q("term", announced_date_first=date_partial)
            match_dates.append(match_date_partial)

        # Also try using yyyy-MM and yyyy formats.
        match_date_announced = _query_announcement_date(date_fragment)
        if match_date_announced:
            match_dates.append(match_date_announced)

        # Build the composite announcement date query here, using the
        # sub-queries based on "standard" and legacy date formats.
        if match_dates:
            # The only way to know in the end whether the query matched on
            # the announcement date is to wrap this in a top-level query and
            # give it a ``_name``. This causes the ``_name`` to show up
            # in the ``.meta.matched_queries`` property on the search result.
            match_date = Q("bool", should=match_dates,
                           minimum_should_match=1,
                           _name="announced_date_first")
            logger.debug('match date: %s', match_date)
            queries.insert(0, match_date)

        # Now join the announcement date query with the all-fields queries.
        if match_date is not None:
            if remainder:
                match_remainder = _query_combined(remainder)
                match_all_fields |= (match_remainder & match_date)

                match_sans_date = reduce(ior, [
                    _query_paper_id(remainder, operator='AND'),
                    author_query(remainder, operator='AND'),
                    _query_title(remainder, default_operator='and'),
                    _query_abstract(remainder, default_operator='and'),
                    _query_comments(remainder, default_operator='and'),
                    orcid_query(remainder, operator='and'),
                    author_id_query(remainder, operator='and'),
                    _query_doi(remainder, operator='and'),
                    _query_journal_ref(remainder, operator='and'),
                    _query_report_num(remainder, operator='and'),
                    _query_acm_class(remainder, operator='and'),
                    _query_msc_class(remainder, operator='and'),
                    _query_primary(remainder, operator='and'),
                    _query_secondary(remainder, operator='and')
                ])
                match_individual_field |= (match_sans_date & match_date)
            else:
                match_all_fields |= match_date

    query = (match_all_fields | match_individual_field)
    query &= Q("bool", should=queries)  # Partial matches across fields.

    scores = [SF({'weight': i + 1, 'filter': q})
              for i, q in enumerate(queries[::-1])]
    return Q('function_score',
             query=query,
             score_mode="sum",
             functions=scores,
             boost_mode='multiply')
#%%
def query_ngram(ngram=None, index=["now"], case=False, page_from=0,
                page_to=10, fields=None):
    index = ",".join(index)
    if case:
        q = Q("match_phrase", token={"query": ngram})
    else:
        q = Q("match_phrase", token_insensitive={"query": ngram})
    rand_q = Q("function_score", functions=[SF("random_score")])
    b = Sentence().search(index=index).query(q & rand_q)
    b = b.source(fields)
    res = b[page_from:page_to].execute()
    res = res.to_dict()["hits"]["hits"]
    return res


#%%
def query_document(id_=None, index="now", fields=None):
    # index = ",".join(index)
    page = Page().get(id_, index=index)
    # sentence = Sentence().get(child_id, index=index)
    # page = Page().get(sentence.meta.routing, index=index)
# coding=utf-8
from datetime import datetime, timedelta

from elasticsearch_dsl import (DocType, Date, Integer, Text, Float, Boolean,
                               Keyword, SF, Q, A, Completion)
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer

from config import SEARCH_FIELDS
from .speaker import User, session

connections.create_connection(hosts=['localhost'])

gauss_sf = SF('gauss', starts_at={
    'origin': 'now',
    'offset': '7d',
    'scale': '10d'
})

log_sf = SF('script_score', script={
    'lang': 'painless',
    'inline': ("Math.log10(doc['seats_taken'].value * doc['amount'].value) * "
               "doc['feedback_score'].value")
})


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
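# Hedged usage sketch (not part of the original module): the two score
# functions above are typically combined under a single ``function_score``
# query, so documents are boosted both by how soon they start (gauss_sf) and
# by engagement (log_sf). The multi_match over SEARCH_FIELDS is an assumed
# base query for illustration.
def build_scored_query(keyword):
    base = Q('multi_match', query=keyword, fields=SEARCH_FIELDS) if keyword else Q()
    return Q('function_score', query=base, functions=[gauss_sf, log_sf])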
from models.core import Post

connections.create_connection(hosts=ES_HOSTS)

# item.id, item.kind: fetch the object and cache it
ITEM_MC_KEY = 'search:get({},{})'
# tag, page, order, per_page: search hot posts only, cached for one hour
POST_IDS_BY_TAG_MC_KEY = 'search:get_post_ids_by_tag(%s,%s,%s,%s)'
SEARCH_FIELDS = ['title^10', 'tags^5', 'content^2']
TARGET_MAPPER = {K_POST: Post}

gauss_sf = SF('gauss', created_at={
    'origin': 'now',
    'offset': '7d',
    'scale': '10d'
})

score_sf = SF('script_score', script={
    'lang': 'painless',
    'inline': ("doc['n_likes'].value * 2 + doc['n_collects'].value")
})


def get_item_data(item):
    """Return a formatted data dict
def evaluate_score(student, client, num_resp: int = 25):
    """Takes a student, represented as a dictionary, and an elasticsearch-py
    client, and returns an elastic response.

    See above student class for schema.
    """
    # Adjust weights here:
    base_score = 1.0
    company_score = 1.0
    rural_score = 2.0
    tags_score = 3.0
    underrep_score = 1.0
    # Timezone weights are found in the timezone script query

    s = elasticsearch_dsl.Search(using=client,
                                 index="mentors_index").extra(explain=True)

    # Start by filtering the search by track
    s = s.filter("term", track=student["track"])

    # And also by requireExtended
    if student["requireExtended"]:
        s = s.filter("term", okExtended="true")
    if not student["underrepresented"]:
        s = s.exclude("term", preferStudentUnderRep=2)

    # Adds one to all remaining entries in order to be sure that, in the
    # worst case, there are enough responses, even if they aren't a good fit
    base_value = Q("constant_score", filter=MatchAll(), boost=base_score)

    # Uses a fuzzy query to determine if a student is interested in the
    # mentor's company, then if so adds `weight` to the score
    company_q = None
    for company in student["interestCompanies"]:
        if company_q is None:
            company_q = Q(
                "function_score",
                query=Q("fuzzy", company=company),
                weight=company_score,
                boost_mode="replace",
            )
        else:
            company_q = company_q | Q(
                "function_score",
                query=Q("fuzzy", company=company),
                weight=company_score,
                boost_mode="replace",
            )

    if student["rural"]:
        # If background_rural matches on mentor and student, then add one to
        # the score
        background_rural = Q("constant_score",
                             filter=Q("term", backgroundRural=student["rural"]),
                             boost=rural_score)
    else:
        background_rural = Q("constant_score", filter=MatchNone())

    # Adds `weight` * the number of matching tags to the score
    tags_matching = None
    num_interests = len(student["interestTags"])
    for interest in student["interestTags"]:
        if tags_matching is None:
            tags_matching = Q(
                "function_score",
                query=Q("term", proj_tags=interest),
                weight=tags_score / num_interests,
                boost_mode="replace",
            )
        else:
            tags_matching = tags_matching | Q(
                "function_score",
                query=Q("term", proj_tags=interest),
                weight=tags_score / num_interests,
                boost_mode="replace",
            )

    combined_query = (
        base_value
        | tags_matching
        | company_q
        | background_rural
        # | prefer_student_underrep
    )

    # Decay the combined score based on the number of students who have
    # already selected that mentor
    combined_query = Q("function_score",
                       query=combined_query,
                       functions=[SF("gauss", numStudentsSelected={
                           "origin": 0,
                           "scale": 3,
                           "offset": 3,
                           "decay": 0.50
                       })])

    # Timezone - this one's a bit more complex. See comments in the script
    # for more details. It multiplies its value into the previous scores,
    # allowing it to reduce, zero out, or increase scores.
    # Python equivalent of the script:
    """
    if mentor['okTimezoneDifference']:
        if 16 < student['timezone'] < 22:
            return True
        return False
    else:
        if abs(student['timezone'] - mentor['timezone']) < 3:
            return True
        return False
    """

    s = s.query(combined_query)[0:num_resp]
    resp = s.execute()
    return resp
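# Hedged sketch (not in the original source): the timezone script referenced
# above is not shown; this is one way the described logic could be expressed
# as a script_score function. The field names ``okTimezoneDifference`` and
# ``timezone`` come from the pseudocode; ``student_tz`` is an assumed script
# parameter filled from student["timezone"]. A score script must return a
# number, so it yields 1 (keep the score) or 0 (zero it out) and would be
# applied with boost_mode="multiply".
timezone_sf = SF("script_score", script={
    "lang": "painless",
    "params": {"student_tz": 19},  # example value only
    "source": (
        "if (doc['okTimezoneDifference'].value) {"
        " return (params.student_tz > 16 && params.student_tz < 22) ? 1 : 0; "
        "} else {"
        " return (Math.abs(params.student_tz - doc['timezone'].value) < 3) ? 1 : 0; "
        "}"
    ),
})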
from config import ES_HOSTS, PER_PAGE
from corelib.mc import cache, rdb

from .consts import K_POST, ONE_HOUR
from .core import Post

connections.create_connection(hosts=ES_HOSTS)

MC_KEY_ITEM = "core:search:{}:{}"
MC_KEY_POST_IDS_BY_TAG = "core:search:post_ids_by_tag:{}:{}:{}:{}"
SERACH_FIELDS = ["title^10", "tags^5", "content^2"]
TARGET_MAPPER = {K_POST: Post}

gauss_sf = SF("gauss", created_at={
    "origin": "now",
    "offset": "7d",
    "scale": "10d"
})

score_sf = SF(
    "script_score",
    script={
        "lang": "painless",
        "inline": ("doc['n_likes'].value * 2 + doc['n_collects'].value"),
    },
)


def get_item_data(item):
    try:
        content = item.content
    except AttributeError: