def get_by_unique_key(self, unique_key, name):
    term = 'unique_keys.' + unique_key
    # had to use ** kw notation because of variable in field name
    search = Search(using=self.es)
    search = search.filter('term', **{term: name})
    search = search.extra(version=True)
    return self._one(search)
def get_rev_links(self, model, rel, *item_types):
    search = Search(using=self.es)
    search = search.extra(size=SEARCH_MAX)
    # rel links use '~' instead of '.' due to ES field restraints
    proc_rel = rel.replace('.', '~')
    # had to use ** kw notation because of variable in field name
    search = search.filter('term', **{'links.' + proc_rel: str(model.uuid)})
    if item_types:
        search = search.filter('terms', item_type=item_types)
    hits = search.execute()
    return [hit.to_dict().get('uuid', hit.to_dict().get('_id')) for hit in hits]
def search(self, **params):
    limit_cat = params.get('cat', "").strip()
    limit_forum = params.get('forum', "").strip()
    limit_count = int(params.get('count', 100))
    limit_size_min = human2bytes(params.get('min', "0b"))
    limit_size_max = human2bytes(params.get('max', "0b"))
    limit_wild = int(params.get('wild', 0))

    arg = params.get('query', '').strip()
    if not arg:
        arg = "hobbit"

    s = Search(using=es, index=ela_index)
    if limit_size_min:
        s = s.filter("range", size={'gte': limit_size_min})
    if limit_size_max:
        s = s.filter("range", size={'lte': limit_size_max})

    arg = arg.split(' ')
    if limit_wild:
        q = Q("wildcard", name="*" + arg.pop(0) + "*")
        for a in arg:
            q = q & Q("wildcard", name="*" + a + "*")
    else:
        q = Q("match", name=arg.pop(0))
        for a in arg:
            q = q & Q("match", name=a)

    if len(limit_cat):
        for a in limit_cat.split(' '):
            q = q & Q("match", category=a)
    if len(limit_forum):
        for a in limit_forum.split(' '):
            q = q & Q("match", forum=a)

    s = s.query(q)
    # cherrypy.log("query is " + str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    # cherrypy.log("query have " + str(size) + " elements")
    if size > limit_count:
        size = limit_count

    s = s.sort('-size')
    s = s.extra(size=size)
    r = s.execute()

    data = []
    for b in r:
        a = [b.id, b.size, b.name, b.category, b.forum,
             b.date[0] if b.date else '', b.hash]
        data.append(a)
    return {'data': data}
def avg_bytes_per_request():
    # return the average size of the object returned to the client for each URL
    avg_bytes_per_request = {}
    s = Search(index="my_index")
    # group results by URL, then average the byte size
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).metric(
        "avg_bytes", "avg", field="bytes"
    )
    response = s.extra(size=10000).execute()
    # fill the dictionary with each different URL associated with the average
    # size of the object it returns
    for per_request in response.aggregations.per_request.buckets:
        avg_bytes_per_request[per_request.key] = per_request.avg_bytes.value
    return avg_bytes_per_request
def search(self, **params):
    limit_author = params.get('author', "").strip()
    limit_title = params.get('title', "").strip()
    limit_count = int(params.get('count', 10))
    limit_wild = int(params.get('wild', 0))
    q = None

    if not limit_author and not limit_title:
        limit_title = "hobbit"

    s = Search(using=es, index=ela_index)

    arg = limit_title.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        if limit_wild:
            q = Q("wildcard", title="*" + arg.pop(0) + "*")
            for a in arg:
                q = q & Q("wildcard", title="*" + a + "*")
        else:
            q = Q("match", title=arg.pop(0))
            for a in arg:
                q = q & Q("match", title=a)

    arg = limit_author.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        for a in arg:
            if q:
                q = q & Q("match", author=a)
            else:
                q = Q("match", author=a)

    s = s.query(q)
    # cherrypy.log("query is " + str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    if size > limit_count:
        size = limit_count

    s = s.sort('-date')
    s = s.extra(size=size)
    r = s.execute()
    # cherrypy.log("result is " + str(r))

    data = []
    for b in r:
        a = [b.id, b.author, b.title, b.size, b.date]
        data.append(a)
    return {'data': data}
def avg_nb_con_per_request_per_clientip():
    # return the average number of requests made by each client to each page
    avg_nb_con_per_request_per_clientip = {}  # initialize the return dictionary
    s = Search(index="my_index")
    # group results by URL, then by client IP
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).bucket(
        "per_clientip", "terms", field="clientip", size=10000
    )
    response = s.extra(size=10000).execute()  # execute the query
    for per_request in response.aggregations.per_request.buckets:  # for each distinct URL
        nb_con_per_request = 0  # initialize the occurrences counter
        for per_clientip in per_request.per_clientip.buckets:  # for each distinct client IP
            nb_con_per_request += per_clientip.doc_count  # count the occurrences
        # fill the dictionary with each different URL associated with its
        # per-client consultation average
        avg_nb_con_per_request_per_clientip[per_request.key] = nb_con_per_request / len(
            per_request.per_clientip.buckets
        )
    return avg_nb_con_per_request_per_clientip
def search_command():
    """Performs a search in Elasticsearch."""
    index = demisto.args().get('index')
    query = demisto.args().get('query')
    fields = demisto.args().get('fields')  # fields to display
    explain = 'true' == demisto.args().get('explain')
    base_page = int(demisto.args().get('page'))
    size = int(demisto.args().get('size'))
    sort_field = demisto.args().get('sort-field')
    sort_order = demisto.args().get('sort-order')

    es = elasticsearch_builder()

    que = QueryString(query=query)
    search = Search(using=es, index=index).query(que)[base_page:base_page + size]
    if explain:
        # if the 'explain' parameter is set to 'true' - adds an explanation
        # section to the search results
        search = search.extra(explain=True)

    if fields is not None:
        fields = fields.split(',')
        search = search.source(fields)

    if sort_field is not None:
        search = search.sort({sort_field: {'order': sort_order}})

    response = search.execute().to_dict()
    total_dict, total_results = get_total_results(response)

    search_context, meta_headers, hit_tables, hit_headers = results_to_context(
        index, query, base_page, size, total_dict, response)
    search_human_readable = tableToMarkdown('Search Metadata:', search_context,
                                            meta_headers, removeNull=True)
    hits_human_readable = tableToMarkdown('Hits:', hit_tables, hit_headers, removeNull=True)
    total_human_readable = search_human_readable + '\n' + hits_human_readable
    full_context = {
        'Elasticsearch.Search(val.Query == obj.Query && val.Index == obj.Index '
        '&& val.Server == obj.Server && val.Page == obj.Page && val.Size == obj.Size)': search_context
    }

    return_outputs(total_human_readable, full_context, response)
def referrers_per_request():
    # return the count of occurrences for each page/previous page couple
    referrers_per_request = {}
    s = Search(index="my_index")
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).bucket(
        "per_referrer", "terms", field="referrer.untouched", size=10000
    )
    response = s.extra(size=10000).execute()
    for per_request in response.aggregations.per_request.buckets:  # for each distinct URL
        referrers_per_request[per_request.key] = {}
        for per_referrer in per_request.per_referrer.buckets:  # for each distinct previous page
            # fill the dictionary: for each URL, map each distinct previous
            # page to the number of occurrences of that relationship
            referrers_per_request[per_request.key][per_referrer.key] = per_referrer.doc_count
    return referrers_per_request
def get_already_imported_ids(es, es_index_prefix, es_type_name):
    """
    Returns the existing ES ids for the provided index and type.

    :param es: es-connection instance
    :param es_index_prefix: ``str`` Index prefix
    :param es_type_name: ``str`` ES document type name
    :return: ``set`` Set of already imported ids (checking whether an id exists is O(1))
    """
    index_name = es_index_prefix.format('*')
    s = Search(using=es, index=index_name, doc_type=es_type_name)
    s = s.extra(stored_fields=[])

    ids = set()
    for h in s.scan():
        ids.add(h.meta.id)
    return ids
def get_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    consolidate_works: bool = False,
    filter_stage: List[str] = [],
    sort: Optional[str] = None,
    limit: int = 25,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:
    search = Search(using=es_client, index=es_index)

    if consolidate_works:
        search = search.extra(
            collapse={
                "field": "source_work_ident",
                "inner_hits": {
                    "name": "source_more",
                    "size": 0,
                },
            })

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term", target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("term", source_stage=filter_stage)

    if sort == "newest":
        search = search.sort("-source_year")
    elif sort == "oldest":
        search = search.sort("source_year")
    else:
        search = search.sort("-source_year")

    return _execute_ref_query(search, limit=limit, offset=offset)
def get_queryset(self, queryset, data):
    phrase = data.get('q')
    if 'models' not in data:
        models = self._supported_models
    else:
        models = data['models'].split(',')
    advanced = data.get('advanced')
    op, suffix = get_advanced_options(advanced)
    lang = get_language()
    per_model = data.get('per_model', 1)
    ms = MultiSearch(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
    for model in models:
        if is_enabled('S39_filter_by_geodata.be') and model in self._completion_models:
            sug_query = Search(index=f'{model}s')
            sug_query = sug_query.suggest('title', phrase, completion={
                'field': f'title.{lang}.suggest',
                'size': per_model
            })
            res = sug_query.execute()
            suggestions = res.suggest['title'][0]
            ids = [sug['_id'] for sug in suggestions['options']]
            query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
            query = query.filter('term', model=model).query('ids', values=ids)
        else:
            query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
            query = query.filter('term', model=model)
            query = query.query('bool', should=[
                nested_query_with_advanced_opts(phrase, field, lang, op, suffix)
                for field in ('title', 'notes')
            ])
        query = query.extra(size=per_model)
        ms = ms.add(query)
    return ms
async def get(self):
    """Get the results from Elasticsearch."""
    q = self.request.query.get("q")
    if not q:
        return web.json_response([])

    es = Elasticsearch(
        hosts=[self.request.app["settings"].ELASTICSEARCH_URL],
        timeout=ELASTICSEARCH_TIMEOUT,
        verify_certs=ELASTICSEARCH_VERIFY_CERTS,
    )
    mapping = es.indices.get_mapping(ELASTICSEARCH_INDEX, include_type_name=True)

    search = Search(index=ELASTICSEARCH_INDEX, using=es)
    search = search.highlight_options(
        pre_tags=[PRE_HIGHLIGHT_TAG],
        post_tags=[POST_HIGHLIGHT_TAG],
    )
    query = self.queries(mapping, q)
    search = search.query(query)
    highlights = self.build_highlight(
        mapping[ELASTICSEARCH_INDEX]["mappings"]["_doc"]["properties"])
    for highlight in highlights:
        search = search.highlight(highlight, type="plain")
    search = search.extra(
        from_=0,
        size=MAX_RESULTS,
    )

    values = []
    for hit in search.execute():
        hit._d_.pop(META, None)
        if HIGHLIGHT and hasattr(hit.meta, "highlight"):
            highlight = hit.meta.highlight
            query = DictQuery(hit._d_)
            for key in highlight:
                path = key.split(".")[:-1]
                value = highlight[key][0]
                query.set("/".join(path), value)
            values.append(query)
        else:
            values.append(hit._d_)

    return web.json_response(values)
def load_filtered_top_associations_search_after(filters, search_after=''):
    """Retrieves top associations and filters them through the tickable options"""
    s = Search(using=es, doc_type='associations')
    s = s.sort('-score', '_uid')
    s = filter_association_search(s, filters)
    if search_after != '':
        search_after = parse_lastel(search_after)
        print(search_after)
        s = s.extra(search_after=search_after)
    s = s[0:25]
    print(json.dumps(s.to_dict()))
    result = s.execute()
    associations = result['hits']['hits']
    last_el = result['hits']['hits'][-1]['sort']
    # Transformation needed to safeguard url transmission
    last_el[1] = "-".join(last_el[1].split('#'))
    return ([association['_source'].to_dict() for association in associations],
            result['hits']['total'], last_el)
def execute_query(
    self, es_search: es_dsl.Search, *, from_: int = 0, size: Optional[int] = None
) -> Dict:
    if from_ is None:
        raise ValueError("'from_' must have a value.")

    response = {"hits": {"hits": [], "total": 0}}
    if size is None or (from_ + size > search_settings.scan_limit):
        # tmp_search = es_search.extra(from_=0, size=0)
        # tmp_response = tmp_search.execute()
        # tot_hits = tmp_response.hits.total
        tot_hits = es_search.count()
        response["hits"]["total"] = tot_hits
        if size is None:
            size = tot_hits - from_
        if tot_hits < from_:
            return response

    if size + from_ <= search_settings.scan_limit:
        extra_kwargs = {}
        if from_ is not None:
            extra_kwargs["from_"] = from_
        if size is not None:
            extra_kwargs["size"] = size
        if extra_kwargs:
            es_search = es_search.extra(**extra_kwargs)
        return es_search.execute().to_dict()
    else:
        es_search = es_search.params(preserve_order=True, scroll="5m")
        # Workaround
        scan_iter = elasticsearch.helpers.scan(
            es_search._using,
            query=es_search.to_dict(),
            index=es_search._index,
            doc_type=es_search._get_doc_type(),
            **es_search._params,
        )
        # scan_iter = es_search.scan()
        for hit in itertools.islice(scan_iter, from_, from_ + size):
            response["hits"]["hits"].append(hit)
            # response["hits"]["hits"].append(hit.to_dict())
        return response
def listing(self, system=None, file_path=None, offset=0, limit=100, **kwargs):
    """Wrap the search result in a BaseFile object for serialization."""
    query = self.construct_query(**kwargs)
    listing_search = Search()
    listing_search = listing_search.filter(query).sort(
        '_index',
        {'project._exact': {
            'order': 'asc',
            'unmapped_type': 'keyword'
        }},
        {'created': {
            'order': 'desc',
            'unmapped_type': 'long'
        }})
    listing_search = listing_search.extra(from_=offset, size=limit)
    res = listing_search.execute()
    children = []
    for hit in res:
        try:
            getattr(hit, 'projectId')
            children.append(BaseESPublication(**hit.to_dict()).to_file())
        except AttributeError:
            children.append(
                BaseESPublicationLegacy(**hit.to_dict()).to_file())

    result = {
        'trail': [{
            'name': '$SEARCH',
            'path': '/$SEARCH'
        }],
        'name': '$SEARCH',
        'path': '/',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
    return result
def get_document_text_slice(self, slice_count=0, slice_size=1000, slice_id=0):
    s = Search(using=self.es, index=self.index, doc_type='items') \
        .query(Q({"match_all": {}})) \
        .params(scroll='5m', size=slice_size)
    # s = s.extra(slice={"id": work, "max": 1})
    s = s.extra(slice={'id': slice_id, 'max': slice_count})
    response = s.execute()
    # print("MIN ID:", min(map(int, ([h['_id'] for h in response.hits.hits]))))
    # print("MAX ID:", max(map(int, ([h['_id'] for h in response.hits.hits]))))
    # print("ID COUNT:", len([h['_id'] for h in response.hits.hits]))
    for document in response:
        if 'itemText' in document:
            yield document.meta.id, document['itemText']
        else:
            yield document.meta.id, ''
def get_queryset(self):
    try:
        keyword = self.request.GET['keyword']
        s = Search(using=es, index='recipe')
        s.update_from_dict({
            'query': {
                'match': {
                    'name': {
                        'query': keyword,
                        'type': 'phrase_prefix',
                        'slop': 2
                    }
                },
            }
        })
        s = s.extra(size=1000)
        results = s.execute()
        return results
    except MultiValueDictKeyError:
        pass
def _get_markets_with_dsl(self, from_date, to_date):
    # TODO: This could be fixed now as ES has closed the issue:
    # https://github.com/elastic/elasticsearch-dsl-py/issues/963
    s = Search(using='operations', index="deex-*")
    s = s.extra(size=0)
    s = s.query('bool', filter=[
        Q('term', operation_type=4),
        Q("range", block_data__block_time={'gte': from_date, 'lte': to_date})
    ])

    sources = [
        {'base': A('terms', field='operation_history.op_object.fill_price.base.asset_id.keyword')},
        {'quote': A('terms', field='operation_history.op_object.fill_price.quote.asset_id.keyword')}
    ]

    # Bug here as 'sources' does not support a list.
    a = A('composite', sources=sources, size=10000).metric(
        'volume', 'sum', field='operation_history.op_object.fill_price.quote.amount')

    s.aggs.bucket('pairs', a)

    response = s.execute()
def get_daily_volume(self, from_date, to_date):
    s = Search(using='operations', index="deex-*")
    s = s.extra(size=0)
    s = s.query('bool', filter=[
        Q('term', operation_type=4),
        Q('range', block_data__block_time={'gte': from_date, 'lte': to_date}),
        Q('term', operation_history__op_object__fill_price__quote__asset_id__keyword=config.CORE_ASSET_ID)
    ])

    a = A('date_histogram', field='block_data.block_time', interval='1d', format='yyyy-MM-dd') \
        .metric('volume', 'sum', field='operation_history.op_object.fill_price.quote.amount')
    s.aggs.bucket('volume_over_time', a)

    response = s.execute()

    daily_volumes = []
    for daily_volume in response.aggregations.volume_over_time.buckets:
        daily_volumes.append({
            'date': daily_volume.key_as_string,
            'volume': daily_volume.volume.value
        })

    return daily_volumes
def search_by_query(self, query: Query) -> Search:
    """
    Get Elasticsearch Search instance by given query object

    :param Query query: query object to construct ES's Search object
    :return: Search object constructed by given `query` param
    """
    def convert(name):
        return ESWords.ASC if name == SortOrder.ASC else ESWords.DESC

    extra_params = dict()
    sort_by = dict()
    search = Search(index=self._index, using=self._es_client)
    q = query.data

    if q.filter_by is not None:
        filter_by = self._query_converter.build(q.filter_by)
        search = search.query(filter_by)

    if q.offset is not None:
        extra_params["from_"] = q.offset
    if q.limit is not None:
        extra_params["size"] = q.limit
    if any((i is not None for i in (q.offset, q.limit))):
        search = search.extra(**extra_params)

    if q.order_by is not None:
        string_field_type = q.order_by.kwargs.get("string_field_type")
        if string_field_type is not None and \
                isinstance(q.order_by.field, StringType):
            sort_by[f"{field_to_str(q.order_by.field)}.{string_field_type}"] = \
                {ESWords.ORDER: convert(q.order_by.order)}
        else:
            sort_by[field_to_str(q.order_by.field)] = {
                ESWords.ORDER: convert(q.order_by.order)
            }
        search = search.sort(sort_by)

    return search
def _search_runs(
        self,
        experiment_ids: List[str],
        filter_string: str,
        run_view_type: str,
        max_results: int = SEARCH_MAX_RESULTS_DEFAULT,
        order_by: List[str] = None,
        page_token: str = None,
        columns_to_whitelist: List[str] = None) -> Tuple[List[Run], str]:
    if max_results > 10000:
        raise MlflowException(
            "Invalid value for request parameter max_results. It must be at "
            "most {}, but got value {}".format(10000, max_results),
            INVALID_PARAMETER_VALUE)
    stages = LifecycleStage.view_type_to_stages(run_view_type)
    parsed_filters = SearchUtils.parse_search_filter(filter_string)
    filter_queries = [
        Q("match", experiment_id=experiment_ids[0]),
        Q("terms", lifecycle_stage=stages)
    ]
    filter_queries += self._build_elasticsearch_query(parsed_filters)
    sort_clauses = self._get_orderby_clauses(order_by)
    s = Search(index="mlflow-runs").query('bool', filter=filter_queries)
    s = s.sort(*sort_clauses)
    if page_token != "" and page_token is not None:
        s = s.extra(search_after=ast.literal_eval(page_token))
    response = s.params(size=max_results).execute()
    columns_to_whitelist_key_dict = self._build_columns_to_whitelist_key_dict(
        columns_to_whitelist)
    runs = [
        self._hit_to_mlflow_run(hit, columns_to_whitelist_key_dict)
        for hit in response
    ]
    if len(runs) == max_results:
        next_page_token = response.hits.hits[-1].sort
    else:
        next_page_token = []
    return runs, str(next_page_token)
def listing(self, system=None, file_path=None, offset=0, limit=100, **kwargs):
    """Perform the search and output in a serializable format."""
    query = self.construct_query(system, file_path, **kwargs)
    listing_search = Search()
    listing_search = listing_search.filter(query).sort(
        '_index',
        {'created': {
            'order': 'desc',
            'unmapped_type': 'long'
        }})
    listing_search = listing_search.extra(
        from_=offset, size=limit).source(includes=[
            'project.value', 'created', 'projectId', 'users', 'system'
        ])
    res = listing_search.execute()
    children = []
    for hit in res:
        hit_to_file = BaseESPublication.hit_to_file(hit)
        children.append(hit_to_file)

    result = {
        'trail': [{
            'name': '$SEARCH',
            'path': '/$SEARCH'
        }],
        'name': '$SEARCH',
        'path': '/',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
    return result
def listing(self, system, file_path, offset=0, limit=100, **kwargs):
    """Perform the search and output in a serializable format."""
    query = self.construct_query(system, file_path)
    listing_search = Search()
    listing_search = listing_search.query(query)
    listing_search = listing_search.extra(from_=offset, size=limit)
    res = listing_search.execute()
    children = []
    print(res.hits.total)
    if res.hits.total:
        children = [o.to_dict() for o in res]

    result = {
        'trail': [{'name': '$SEARCH', 'path': '/$SEARCH'}],
        'name': '$SEARCH',
        'path': '/',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
    return result
def listing(self, system=None, file_path=None, offset=0, limit=100, **kwargs):
    """Perform the search and output in a serializable format."""
    query = self.construct_query(system, file_path, **kwargs)
    listing_search = Search()
    listing_search = listing_search.filter(query).sort('_index')
    listing_search = listing_search.extra(from_=offset, size=limit)
    res = listing_search.execute()
    children = []
    for hit in res:
        try:
            getattr(hit, 'projectId')
            hit_to_file = BaseESPublication.hit_to_file(hit)
            children.append(hit_to_file)
        except AttributeError:
            children.append(
                BaseESPublicationLegacy(**hit.to_dict()).to_file())

    result = {
        'trail': [{
            'name': '$SEARCH',
            'path': '/$SEARCH'
        }],
        'name': '$SEARCH',
        'path': '/',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
    return result
class LexEsSearch(object):
    '''Search class for pglex app.'''

    def __init__(self, query={}, project=None, index_ver=None, using=None):
        index = 'lex_{:}_{:}-lex'.format(project, index_ver)
        self.s = Search(using=using, index=index)
        self.project = project
        self.index_ver = index_ver
        self.query = query
        self.results = {}
        self.explain = ('explain' in list(query.keys())
                        and query['explain'] == 'true')
        self.includes_q = 'q' in list(query.keys())

    def build_search(self):
        if self.includes_q:
            self.add_q()
        self.add_filters()
        self.add_popularity()
        self.add_paging()
        self.add_sort()
        self.add_includes()
        self.add_explain()

    def add_q(self):
        '''String-based query with results returned by relevance.'''
        q = self.query['q']
        wildcarding = '*' in q or '?' in q
        if wildcarding is True:
            add_contact_lg = False
        else:
            add_contact_lg = True
        my_search_fields = get_search_fields(self.project, self.index_ver,
                                             self.query, add_target_lang=True,
                                             add_contact_lang=add_contact_lg)
        myboosts = boosts
        sfields = []
        for searchfield in my_search_fields:
            try:
                sfield, boost = searchfield.split('^')
            except ValueError:
                # No '^' in string
                sfield = searchfield
                try:
                    boost = myboosts[sfield]
                except KeyError:
                    boost = '1'
            if not wildcarding:
                sfield += '^' + boost
            else:
                sfield = sfield.replace('.', '__')
                sfield = {sfield: {'value': q.lower(), 'boost': boost}}
            sfields.append(sfield)
        if wildcarding is True:
            queries = []
            for sfield in sfields:
                queries.append(Q("wildcard", **sfield))
            boolquery = Q('bool', should=queries, minimum_should_match=1)
            self.s = self.s.query(boolquery)
        else:
            self.s = self.s.query('multi_match', query=q, fields=sfields)

    def add_filters(self):
        '''Filter queries that entries must match, e.g. part of speech.'''
        for filt in filter_fields:
            try:
                p = self.query[filt]
                # If filter came in as a query (url) param.
                #if type(p) == str and filt_pat.match(p):
                #    p = {filt: p}
                #    app.log.debug('jsonp: ' + p)
                if isinstance(p, list):
                    termtype = 'terms'
                    # pstr = ','.join(p)
                else:
                    termtype = 'term'
                    # pstr = p
                #app.log.debug('termtype: ' + termtype + '; filtp: ' + filt + ' -> ' + pstr)
                self.s = self.s.filter(termtype, **{filt: p})
            except KeyError:
                pass  # filter not included in query

    def add_popularity(self):
        '''Scale scores by a 'popularity' factor calculated from the popcnt
        field or a random function. A random seed can be provided if a
        reproducible random sort is required. If paging through a randomized
        set of results, for example, then use the same seed when retrieving
        each page set.

        If no string search is in the query (i.e. only filters are used), then
        the _score values will be 0.0, in which case multiplying by the factor
        will have no effect, so the factor value replaces _score instead. This
        means documents will be scored based only on the popularity factor.'''
        if self.includes_q:
            mode = 'multiply'
        else:
            mode = 'replace'
        try:
            if self.query['pf'] == 'rand':
                try:
                    randarg = {'seed': self.query['seed'], 'field': '_seq_no'}
                except KeyError:
                    randarg = {}
                freqq = Q('function_score', query=self.s.query,
                          random_score=randarg, boost_mode=mode)
                self.s = self.s.query(freqq)
            elif self.query['pf'] != '0':
                freqq = Q('function_score', query=self.s.query,
                          field_value_factor={
                              'field': 'popcnt',
                              'modifier': 'ln1p',
                              'factor': int(self.query['pf']),
                              'missing': 1
                          },
                          boost_mode=mode)
                self.s = self.s.query(freqq)
        except KeyError:
            pass

    def add_paging(self):
        '''Return a page of results. The default is the first 10 entries.'''
        try:
            size = int(self.query['size'])
        except KeyError:
            size = 10
        try:
            getfrom = int(self.query['from'])
        except KeyError:
            getfrom = 0
        self.s = self.s[getfrom:getfrom + size]

    def add_sort(self):
        sortfld = '_score'
        sortparams = {'order': 'desc'}
        try:
            keys = list(self.query.keys())
            assert ('sort' in keys or 'order' in keys or 'sortmode' in keys)
            if 'sort' in keys:
                sortfld = self.query['sort']
            if 'order' in keys:
                sortparams['order'] = self.query['order']
            else:
                if sortfld == '_score':
                    sortparams['order'] = 'desc'
                else:
                    sortparams['order'] = 'asc'
            if 'sortmode' in keys:
                sortparams['mode'] = self.query['sortmode']
        except AssertionError:
            pass
        self.s = self.s.sort({sortfld: sortparams})

    def add_includes(self):
        try:
            inc_fields = self.query['inc'].split(',')
            self.s = self.s.source({'include': inc_fields})
        except KeyError:
            self.s = self.s.source(source_fields)

    def add_explain(self):
        if self.explain is True:
            self.s = self.s.extra(explain=True)
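A hedged usage sketch of the query dict that LexEsSearch reads above; the host, the project/index_ver values, and the field names in 'inc' are invented for illustration and are not part of the original code.

# Hypothetical usage of the class above; values are illustrative only.
from elasticsearch import Elasticsearch

es = Elasticsearch(["localhost:9200"])  # assumed connection
query = {
    'q': 'water',              # relevance-ranked string query
    'pf': '2',                 # popularity factor ('rand' would randomize instead)
    'size': '20',              # page size
    'from': '0',               # page offset
    'sort': '_score',          # sort field (order defaults to desc for _score)
    'inc': 'headword,gloss',   # comma-separated fields to return (hypothetical names)
    'explain': 'true',         # attach ES scoring explanations to each hit
}
lex_search = LexEsSearch(query=query, project='demo', index_ver='1', using=es)
lex_search.build_search()
response = lex_search.s.execute()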
def dump_slice(slice_no):
    s = Search()
    s = s.extra(slice={"id": slice_no, "max": SLICES})
    for d in s.scan():
        print(d.meta.id)
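The snippet above scans only one slice of a sliced scroll. A minimal sketch of how such slices might be driven in parallel follows; the SLICES value, the host, and the use of multiprocessing are assumptions, not part of the original code.

from multiprocessing import Pool

from elasticsearch_dsl import connections

SLICES = 4  # assumed to match the constant used inside dump_slice


def _init_worker():
    # Create the default elasticsearch-dsl connection inside each worker process.
    connections.create_connection(hosts=["localhost:9200"])


if __name__ == "__main__":
    # One worker per slice; each scan walks only its own subset of the index.
    with Pool(SLICES, initializer=_init_worker) as pool:
        pool.map(dump_slice, range(SLICES))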
def locations_generator(**kwargs):
    import datetime
    from geo.models import Area, District, Locality
    from mainapp.documents import DocumentLocation
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DOCUMENT_EVAL
    from elasticsearch_dsl import Search, Q

    criterion_tm_duos = kwargs['criterion_tm_duos']  # ((tm_1, criterion_id_1)....()...())

    for places in (Area, District, Locality):
        location_level = places.objects.first()._meta.verbose_name
        if places == Area:
            print('!!! Parsing Areas ...', datetime.datetime.now())
        elif places == District:
            print('!!! Parsing Districts ...', datetime.datetime.now())
        else:
            print('!!! Parsing Localities ...', datetime.datetime.now())
        for i, geo in enumerate(places.objects.all()):
            s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT) \
                .source(['datetime', 'source', 'text', 'text_lemmatized', 'title', 'text_lemmatized_yandex'])
            q = Q(
                'bool',
                should=[Q("match_phrase", text_lemmatized=geo.name)] +
                       [Q("match_phrase", text=geo.name)] +
                       [Q("match_phrase", title=geo.name)] +
                       [Q("match_phrase", text_lemmatized_yandex=geo.name)],
                minimum_should_match=1,
            )
            s = s.query(q)
            s = s.extra(track_scores=True)
            print(f'!!! Scans count for {i} geo inside place: ', s.count(), datetime.datetime.now())
            scans = s.scan()
            for scan_obj in scans:
                document_datetime, document_source = hit_parser(scan_obj)
                doc = DocumentLocation(
                    document_es_id=scan_obj.meta.id,
                    document_datetime=document_datetime,
                    document_source=document_source,
                    location_name=geo.name,
                    location_level=location_level,
                    location_weight=scan_obj.meta.score,
                    location_id=geo.id,
                )
                for tm, criterion_id in criterion_tm_duos:
                    ev_docs = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion_id}") \
                        .filter("term", document_es_id=scan_obj.meta.id) \
                        .source(['value', 'document_datetime', 'document_source']) \
                        .execute()
                    if not ev_docs:
                        continue
                    ev_docs = ev_docs[0]
                    value = ev_docs.value if hasattr(ev_docs, "value") and ev_docs.value else None
                    doc[f'criterion_{tm}_{criterion_id}'] = value
                yield doc
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = []
        for _license in search_params.data[license_field].split(','):
            license_filters.append(Q('term', license__keyword=_license))
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key, timeout=CACHE_TIMEOUT, value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        s = s.query('constant_score', filter=Q(
            'query_string',
            query=search_params.data['q'],
            fields=['tags.name', 'title'],
        ))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score',
                        filter=Q('query_string', query=creator, default_field='creator'))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score',
                        filter=Q('query_string', query=title, default_field='title'))
        if 'tags' in search_params.data:
            tags = search_params.data['tags']
            s = s.query('constant_score',
                        filter=Q('query_string', default_field='tags.name', query=tags))

    # Search.extra returns a copy, so the result has to be assigned back.
    s = s.extra(track_scores=True)
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)

    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search.extra returns a copy, so the result has to be assigned back.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))

    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()

    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )
    return results, page_count, result_count
def apply_paging(self,
                 catalog: CatalogName,
                 es_search: Search,
                 pagination: Pagination,
                 peek_ahead: bool = True) -> Search:
    """
    Set sorting and paging parameters for the given ES search request.

    :param catalog: The name of the catalog to search in
    :param es_search: The Elasticsearch request object
    :param pagination: The sorting and paging settings to apply
    :param peek_ahead: If True, request one more hit so that
                       _generate_paging_dict can know if there is another
                       page. Use this to prevent a last page that's empty.
    """
    sort_field = pagination.sort + '.keyword'
    sort_order = pagination.order
    field_type = self.field_type(catalog, tuple(pagination.sort.split('.')))
    sort_mode = field_type.es_sort_mode

    def sort(order):
        assert order in ('asc', 'desc'), order
        return (
            {
                sort_field: {
                    'order': order,
                    'mode': sort_mode,
                    'missing': '_last' if order == 'asc' else '_first',
                    **({} if field_type.es_type is None else {
                        'unmapped_type': field_type.es_type
                    })
                }
            },
            # This secondary sort field serves as the tie breaker for when
            # the primary sort field is not unique across documents.
            # Otherwise it's redundant, especially if it's the same as the
            # primary sort field. However, always having a secondary
            # simplifies the code and most real-world use cases use sort
            # fields that are not unique.
            {
                'entity_id.keyword': {
                    'order': order
                }
            }
        )

    # Using search_after/search_before pagination
    if pagination.search_after is not None:
        es_search = es_search.extra(search_after=pagination.search_after)
        es_search = es_search.sort(*sort(sort_order))
    elif pagination.search_before is not None:
        es_search = es_search.extra(search_after=pagination.search_before)
        rev_order = 'asc' if sort_order == 'desc' else 'desc'
        es_search = es_search.sort(*sort(rev_order))
    else:
        es_search = es_search.sort(*sort(sort_order))

    # FIXME: Remove this or change to 10000 (the default)
    #        https://github.com/DataBiosphere/azul/issues/3770
    es_search = es_search.extra(track_total_hits=True)

    assert isinstance(peek_ahead, bool), type(peek_ahead)
    # fetch one more than needed to see if there's a "next page".
    es_search = es_search.extra(size=pagination.size + peek_ahead)

    return es_search
def make_query(q, lon=None, lat=None, match_all=True, limit=15, filters=None):
    if filters is None:
        filters = {}
    s = Search(es).index(INDEX)
    should_match = '100%' if match_all else '2<-1 6<-2 8<-3 10<-50%'
    match = Q(
        'bool',
        must=[Q('match', collector={
            'fuzziness': 1,
            'prefix_length': 2,
            'query': q,
            'minimum_should_match': should_match,
            'analyzer': 'search_stringanalyzer'
        })],
        should=[
            Q('match', **{'name.keywords': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'street.keywords': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'city.default': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'way_label': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'housenumber': {
                'query': q,
                'boost': 2,
                'analyzer': 'housenumber_analyzer'
            }}),
        ]
    )
    functions = [{
        "script_score": {
            "script": "1 + doc['importance'].value * 40",
            "lang": "groovy"
        }
    }]
    if lon is not None and lat is not None:
        functions.append({
            "script_score": {
                "script": "dist = doc['coordinate'].distanceInKm(lat, lon); 1 / (0.5 - 0.5 * exp(-5*dist/maxDist))",
                "lang": "groovy",
                "params": {
                    "lon": lon,
                    "lat": lat,
                    "maxDist": 100
                }
            }
        })
    fscore = Q(
        'function_score',
        score_mode="multiply",
        boost_mode="multiply",
        query=match,
        functions=functions
    )
    s = s.query(fscore)
    # Only filter out 'house' if we are not explicitly asking for this type.
    if filters.get('type') != 'housenumber':
        # We don't want results with an ordinal (bis, ter…) if the ordinal
        # field itself doesn't match
        filter_ordinal = F('or', [
            F('missing', field="ordinal"),
            F({"query": {"match": {"ordinal": {"query": q, "analyzer": "housenumber_analyzer"}}}}),
        ])
        house_query = Filtered(
            query=Match(housenumber={"query": q, "analyzer": "housenumber_analyzer"}),
            filter=filter_ordinal)
        filter_house = F('or', [
            F('missing', field="housenumber"),
            F('exists', field="name.keywords"),
            F({'query': house_query.to_dict()}),
        ])
        s = s.filter(filter_house)
    if filters:
        # We are not using real filters here, because filters are not analyzed,
        # so for example "city=Chauny" will not match, because "chauny" is in
        # the index instead.
        for k, v in filters.items():
            s = s.query({'match': {k: v}})
    return s.extra(size=limit)
def get_context_data(self, **kwargs):
    context = super(RecipeDetailView, self).get_context_data(**kwargs)
    recipe = self.get_queryset().all()[0]

    # increment count
    if not self.request.session.get('recipe_viewed_%s' % recipe.pk, None):
        recipe.increment_views()
        self.request.session['recipe_viewed_%s' % recipe.pk] = 1
        logger.error('recipe views %s', recipe.views)
        logger.error('recipe session %s', self.request.session.keys())

    course_info = recipe.courses.all()
    holiday_info = recipe.holidays.all()
    context['title'] = context['recipe'].name

    exclude_clause = []
    match_clause = []
    exclude_clause.append(
        {"term": {"document_id": recipe.id}}
    )
    if course_info:
        course_id = course_info[0].id
        match_clause.append({'match': {'courses': {'query': course_id, 'boost': 5}}})
    if holiday_info:
        holiday_id = holiday_info[0].id
        match_clause.append({'match': {'holidays': holiday_id}})
    match_clause.append({'match': {'name': {'query': recipe.name, 'boost': 2}}})

    s = Search(using=es, index='recipe')
    s.update_from_dict({
        'query': {
            'function_score': {
                'query': {
                    'bool': {
                        "must_not": exclude_clause,
                        'should': match_clause
                    }
                },
                'random_score': {
                    'seed': 12371203
                }
            }
        }
    })
    s = s.extra(size=6)
    results = s.execute()
    context['suggested_recipes'] = results

    if self.request.user.is_authenticated():
        user_collection = UserCollection.objects.filter(user=self.request.user)
        recipe_collection = RecipesCollection.objects\
            .filter(recipe_id=context['recipe'].id, collection__user=self.request.user)\
            .only('collection__id')
        user_recipe_collection = set(i.collection_id for i in recipe_collection.all())
        initial = {
            'recipes': self.kwargs.get("pk"),
            'recipe_collection': user_collection,
            'user_recipe_collection': user_recipe_collection
        }
        context['form'] = UserRecipeCollectionForm(
            initial=initial
        )

    context['searchform'] = SearchKeywordForm()
    context['current_recipe_name'] = recipe.name
    return context
def executeTermQuery(self, t, user_terminology, query_type):
    size = self.query_size_full
    if (query_type == "fullmatch"):
        q1 = Q({
            "multi_match": {
                "query": t,
                "fuzziness": 0,
                "fields": [
                    "name.fullmatch_exact^" + self.field_boost,
                    "name.fullmatch_folding"
                ]
            }
        })
    elif (query_type == "fuzzy_fullmatch"):
        q_a = Q({
            "multi_match": {
                "query": t,
                "fuzziness": 1,
                "prefix_length": self.prefix_length,
                "fields": [
                    "name.fullmatch_exact^" + self.field_boost,
                    "name.fullmatch_folding"
                ]
            }
        })
        q_b = Q({
            "multi_match": {
                "query": t,
                "fuzziness": "AUTO",
                "prefix_length": self.prefix_length,
                "fields": [
                    "name.fullmatch_exact^" + self.field_boost,
                    "name.fullmatch_folding"
                ]
            }
        })
        q1 = Q('bool', should=[q_a, q_b])
    else:
        size = self.query_size_shingle
        q1 = Q({
            "multi_match": {
                "query": t,
                "fuzziness": 0,
                "fields": [
                    "name.shinglematch_exact^" + self.field_boost,
                    "name.shinglematch_folding"
                ]
            }
        })

    qFilter = Q('terms', terminology_id=list(self.terminologies_dict.keys()))
    shoud_clause = []
    if user_terminology is not None:
        # limit results to terminologies related to specific domain(s)
        qShould1 = Q('constant_score', filter=Q('terms', terminology_id=user_terminology), boost=20)
        q = Q('bool', must=[q1], should=[qShould1], filter=[qFilter])
    else:
        for k, v in self.terminologies_dict.items():
            shoud_clause.append(
                Q('constant_score', filter=Q('term', terminology_id=k), boost=v))
        #qShould_q = Q('constant_score', filter=Q('term', terminology_id=13), boost=self.quantity_terminology_boost)  # added 21-02-2020 boost by quantity
        #qShould1 = Q('constant_score', filter=Q('terms', terminology_id=self.primary_terminology), boost=self.primary_terminology_boost)
        #qShould2 = Q('constant_score', filter=Q('terms', terminology_id=self.secondary_terminologies), boost=self.second_terminology_boost)
        q = Q('bool', must=[q1], should=shoud_clause, filter=[qFilter])

    #print(q.to_dict())
    s = Search(using=self.elasticSearchInst, index=self.elastic_index,
               doc_type=self.elastic_doctype).query(q)
    s = s.extra(size=size)
    response = s.execute()

    list_res = []
    return_val = []
    if response:
        response = response.to_dict()
        #print("%d documents found" % response['hits']['total'])
        for hit in response['hits']['hits']:
            dictres = {
                "id": int(hit['_id']),
                "name": hit['_source']['name'],
                "abbreviation": hit['_source']['abbreviation'],
                "score": hit['_score'],
                "terminology": hit['_source']['terminology']
            }
            if 'description_uri' in hit['_source']:
                dictres['description_uri'] = hit['_source']['description_uri']
            if 'topics' in hit['_source']:
                dictres['topics'] = hit['_source']['topics']
            list_res.append(dictres)

    if list_res:
        if query_type == "shinglematch":
            # 2020-03-05 do not apply max score filter for shingle match
            fragment_vector = self.tokenize_string(t)  # Counter({'temperature': 1, 'sea': 1, 'surface': 1})
            #print('fragment_vector ', fragment_vector)
            list_ids = [str(d['id']) for d in list_res]
            tokenized_terms_dict = self.tokenize_by_ids(list_ids)
            #print(tokenized_terms_dict)
            list_ids_tuples = self.generateCombinationsByTermIds(list_ids, len(t.split()))
            final_ids = self.compute_cosine_sim(
                tokenized_terms_dict, list_ids_tuples, fragment_vector)
            # remove the records not in final_ids
            return_val = [d for d in list_res if d['id'] in final_ids]
        else:
            #return_val = [d for d in list_res if d['score'] == max_score]
            # 27-02-2020 for full and fuzzy match return the term with max score
            # (for duplicate terms only)
            list_names = [d['name'] for d in list_res]  # don't change to set
            duplicates = {
                item for item, count in Counter(list_names).items() if count > 1
            }
            remove_ids = []
            for dup in duplicates:
                mx = max({d['score'] for d in list_res if d['name'] == dup})
                remove_ids.extend({
                    d['id'] for d in list_res
                    if d['name'] == dup and d['score'] < mx
                })
            return_val = [d for d in list_res if d['id'] not in remove_ids]

    return return_val
def build_search_object(q=None,
                        filters=None,
                        after=None,
                        offset=None,
                        limit=None,
                        fields=None,
                        facets=None,
                        facet_limits=None,
                        sort=None,
                        **kwargs):
    '''
    Given the parameters, creates a new elasticsearch-dsl Search object and returns it.

    :param q: a query string which will be searched against the meta.all field or a dict of
              fields and search values. If this is a dict then the keys (field names) are always
              prefixed with "data." unless the key is an empty string in which case the field used
              is meta.all. This allows combination searches across meta.all and data.* fields.
    :param filters: a dict of fields and values to filter the result with. If a key is present
                    that is equal to "__geo__" then the value associated with it should be a dict
                    which will be treated as a geo query to be run against the `meta.geo` field.
                    The value should contain a "type" key which must have a corresponding value of
                    "point", "box" or "polygon" and then other keys that are dependent on the type:

                        - point:
                            - distance: the radius of the circle centred on the specified location
                                        within which records must lie to be matched. This can be
                                        specified in any form that elasticsearch accepts for
                                        distances (see their doc, but values like 10km etc).
                            - point: the point to centre the radius on, specified as a lat, long
                                     pair in a list (i.e. [-20, 40.2]).
                        - box:
                            - points: the top left and bottom right points of the box, specified
                                      as a list of two lat/long pairs
                                      (i.e. [[-20, 40.2], [0.5, 100]]).
                        - polygon:
                            - points: a list of at least 3 lat/long pairs
                                      (i.e. [[-16, 44], [-13.1, 34.8], [15.99, 35], [5, 49]]).

    :param after: the search after value to start the search result from (for pagination). Cannot
                  be used in conjunction with offset. If both offset and after are provided then
                  after is used and offset is ignored.
    :param offset: the offset to start the search result from (for pagination)
    :param limit: the limit to stop the search result at (for pagination)
    :param fields: a list of field names to return in the result
    :param facets: a list of field names to return an aggregation of top 10 values and counts for
    :param facet_limits: a dict of fields and their customised top n limits
    :param sort: a list of fields to sort by with ordering. By default the fields are sorted
                 ascending, but by providing "desc" after the field name a descending sort will be
                 used. An ascending sort on _id is always added unless included in this list. This
                 is to ensure there is a unique tie-breaking field which is useful for ensuring
                 results stay the same each time they are requested and necessary to ensure the
                 correct result list responses when using the after parameter for pagination.
    :param kwargs: as a convenience we allow a kwargs parameter which we ignore, this is useful
                   as it allows the arguments to be passed by just unpacking the data_dict
    :return: an elasticsearch-dsl Search object
    '''
    search = Search()
    # add a free text query across all fields if there is one. This searches against meta.all
    # which is a copy field created by adding the values of each data.* field
    if q is not None and q != u'' and q != {}:
        if isinstance(q, (str, unicode, int, float)):
            search = search.query(u'match',
                                  **{u'meta.all': {u'query': q, u'operator': u'and'}})
        elif isinstance(q, dict):
            for field, query in sorted(q.items(), key=operator.itemgetter(0)):
                # TODO: change this to __all__ to match __geo__?
                if field == u'':
                    field = u'meta.all'
                else:
                    field = prefix_field(field)
                search = search.query(u'match',
                                      **{field: {u'query': query, u'operator': u'and'}})
    if filters is not None:
        for field, values in sorted(filters.items(), key=operator.itemgetter(0)):
            if not isinstance(values, list):
                values = [values]
            if field == u'__geo__':
                # only pass through the first value
                search = add_geo_search(search, values[0])
            else:
                field = u'{}'.format(prefix_field(field))
                for value in values:
                    # filter on the keyword version of the field
                    search = search.filter(u'term', **{field: value})
    # after and offset cannot be used together, prefer after over offset
    if after is not None:
        search = search.extra(search_after=after)
    elif offset is not None:
        search = search.extra(from_=int(offset))
    # add the limit or a default of 100 if there isn't one specified
    search = search.extra(size=int(limit) if limit is not None else 100)
    if fields is not None:
        search = search.source(map(prefix_field, fields))
    if facets is not None:
        facet_limits = facet_limits if facet_limits is not None else {}
        for facet in facets:
            # to produce the facet counts we use a bucket terms aggregation, note that using the
            # bucket function on the top level aggs attribute on the search object doesn't return
            # a copy of the search object like it does when adding queries etc
            search.aggs.bucket(facet, u'terms', field=prefix_field(facet),
                               size=facet_limits.get(facet, 10))
    # at least one sort is always added, on the _id column. This is necessary to ensure that use
    # of search_after is predictable (in the elasticsearch docs it recommends that a tie-breaker
    # field is present otherwise the response can include duplicates/missing records). The _id
    # field is always unique and therefore an ideal tie-breaker, so we make sure it's always in
    # the sort
    sorts = []
    # if the caller passes in _id then we don't need to add it in again
    id_in_sort = False
    if sort is not None:
        for field_and_sort in sort:
            if not field_and_sort.endswith(u' desc') and not field_and_sort.endswith(u' asc'):
                # default the sort direction to ascending if nothing is provided
                field_and_sort += u' asc'
            field, direction = field_and_sort.rsplit(u' ', 1)
            # set the id_in_sort boolean to True if we see the _id field in the caller defined sort
            id_in_sort = not id_in_sort and field == u'_id'
            field = prefix_field(field)
            # if the sort direction is desc we need to add a minus sign in front of the field
            # name, otherwise we can just use the field name on its own as the default sort is asc
            sorts.append(u'-{}'.format(field) if direction == u'desc' else field)
    # by default, sort by the _id field
    if not id_in_sort:
        sorts.append(prefix_field(u'_id'))
    search = search.sort(*sorts)
    return search
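A hedged usage sketch of the parameter shapes documented in the docstring above; the field names and values are invented for illustration, and prefix_field/add_geo_search behave however the surrounding module defines them.

# Hypothetical call illustrating the documented parameter shapes; values are
# illustrative only and the helpers come from the surrounding module.
search = build_search_object(
    q={'': 'moth', 'genus': 'Acherontia'},      # meta.all search combined with a data.* field
    filters={
        '__geo__': {                            # geo query against meta.geo
            'type': 'point',
            'distance': '10km',
            'point': [-20, 40.2],
        },
        'collectionCode': 'BMNH',               # plain term filter on a data.* field
    },
    limit=50,
    sort=['year desc'],                         # an ascending '_id' sort is appended automatically
)
print(search.to_dict())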
def do_fulltext_search(query: FulltextQuery,
                       deep_page_limit: int = 2000) -> FulltextHits:
    search = Search(using=es_client, index=settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX)

    if query.collapse_key:
        search = search.filter("term", collapse_key=query.collapse_key)
    else:
        search = search.extra(
            collapse={
                "field": "collapse_key",
                "inner_hits": {
                    "name": "more_pages",
                    "size": 0,
                },
            })

    # apply filters from query
    search = apply_filters(search, query)

    # we combine several queries to improve scoring.

    # this query uses the fancy built-in query string parser
    basic_fulltext = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        quote_field_suffix=".exact",
        fields=[
            "title^4",
            "biblio_all^3",
            "everything",
        ],
    )
    has_fulltext = Q("terms", **{"access_type": ["ia_sim", "ia_file", "wayback"]})
    poor_metadata = Q(
        "bool",
        should=[
            # if these fields aren't set, metadata is poor. The more that do
            # not exist, the stronger the signal.
            Q("bool", must_not=Q("exists", field="year")),
            Q("bool", must_not=Q("exists", field="type")),
            Q("bool", must_not=Q("exists", field="stage")),
            Q("bool", must_not=Q("exists", field="biblio.container_name")),
        ],
    )

    if query.filter_availability == "fulltext" or query.filter_availability is None:
        base_query = basic_fulltext
    else:
        base_query = Q("bool", must=basic_fulltext, should=[has_fulltext])

    if query.q == "*":
        search = search.query("match_all")
        search = search.sort("_doc")
    else:
        search = search.query(
            "boosting",
            positive=base_query,
            negative=poor_metadata,
            negative_boost=0.5,
        )

    # simplified version of basic_fulltext query, for highlighting
    highlight_query = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        lenient=True,
    )
    search = search.highlight(
        "abstracts.body",
        "fulltext.body",
        "fulltext.acknowledgement",
        "fulltext.annex",
        highlight_query=highlight_query.to_dict(),
        require_field_match=False,
        number_of_fragments=2,
        fragment_size=200,
        order="score",
        # TODO: this will fix highlight encoding, but requires ES 7.x
        # encoder="html",
    )

    # sort order
    if query.sort_order == "time_asc":
        search = search.sort("year", "date")
    elif query.sort_order == "time_desc":
        search = search.sort("-year", "-date")
    elif query.sort_order == "relevancy" or query.sort_order is None:
        pass
    else:
        raise ValueError(
            f"Unknown 'sort_order' parameter value: '{query.sort_order}'")

    # Sanity checks
    limit = min((int(query.limit or 15), 100))
    offset = max((int(query.offset or 0), 0))
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    search = search.params(track_total_hits=True)
    search = search[offset:(offset + limit)]

    query_start = datetime.datetime.now()
    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e_raw:
        # this is a "user" error
        e: Any = e_raw
        logging.warn("elasticsearch 400: " + str(e.info))
        if e.info.get("error", {}).get("root_cause", {}):
            raise ValueError(
                str(e.info["error"]["root_cause"][0].get("reason"))) from e
        else:
            raise ValueError(str(e.info)) from e
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        logging.warn("elasticsearch non-200 status code: {}".format(e.info))
        raise IOError(str(e.info)) from e
    query_delta = datetime.datetime.now() - query_start

    # convert from API objects to dicts
    results = transform_es_results(resp)

    count_found: int = 0
    if isinstance(resp.hits.total, int):
        count_found = int(resp.hits.total)
    else:
        count_found = int(resp.hits.total["value"])
    count_returned = len(results)

    # if we grouped to less than a page of hits, update returned count
    if (not query.collapse_key) and offset == 0 and (count_returned < limit):
        count_found = count_returned

    return FulltextHits(
        query_type="fulltext",
        count_returned=count_returned,
        count_found=count_found,
        offset=offset,
        limit=limit,
        deep_page_limit=deep_page_limit,
        query_time_ms=int(resp.took),
        query_wall_time_ms=int(query_delta.total_seconds() * 1000),
        results=results,
    )
class ElasticsearchSearch(ABC):
    """Represents a search to be run against Elasticsearch.

    Depending on the search parameters, more than one Elasticsearch query may
    be needed to complete it.

    Attributes:
        _search (elasticsearch_dsl.Search): Main search to send to
            Elasticsearch.
        _index (str): Index the main search is run against.
        _offset (int): Number of results to skip ('from').
        _result (ElasticsearchResult): Result of the search.

    """

    __slots__ = ['_search', '_index', '_offset', '_result']

    def __init__(self, index, query):
        """Initializes an ElasticsearchSearch object.

        Args:
            index (str): See the '_index' attribute.
            query (dict): Search parameters. See the '_read_query' method for
                the values allowed inside the dictionary.

        """
        self._search = Search(index=index)

        if constants.ES_TRACK_TOTAL_HITS:
            # Set the maximum number of hits for which exact hit totals are
            # computed (new in Elasticsearch 7.0.0).
            self._search = self._search.extra(
                track_total_hits=constants.ES_TRACK_TOTAL_HITS)

        self._index = index
        self._offset = query.get('offset', 0)
        self._result = None

        self._read_query(**query)

    @abstractmethod
    def search_steps(self):
        """Returns an iterator of elasticsearch_dsl.Search objects, each one
        representing a step required to complete this ElasticsearchSearch.
        Once the iterator is exhausted, 'self._result' holds the final result
        of the search.

        Yields:
            elasticsearch_dsl.Search: DSL search to execute. Its results
                should be sent back by the caller of 'next()/send()'.

        """
        raise NotImplementedError()

    def _read_query(self, fields=None, size=constants.DEFAULT_SEARCH_SIZE,
                    offset=0):
        """Reads the received search parameters and adds them to the
        'self._search' attribute.

        Args:
            fields (list): List of fields to include in the search results.
            size (int): Maximum number of results to return.
            offset (int): Number of results to skip.

        """
        if fields:
            self._search = self._search.source(includes=fields)

        self._search = self._search[offset:offset + size]

    def _expand_intersection_query(self, geo_shape_ids):
        """Checks that the IDs contained in 'geo_shape_ids' are valid and
        reference existing entities. Invalid IDs are removed.

        This step is needed because Elasticsearch's pre-indexed geometry
        search does not accept IDs of non-existent documents; using an
        invalid ID returns HTTP 400.

        To perform the check, an iterator of elasticsearch_dsl.Search objects
        is returned. This allows the method to be used from 'search_steps',
        adding elasticsearch_dsl.Search instances that must be executed to
        complete the results of the ElasticsearchSearch instance.

        Yields:
            elasticsearch_dsl.Search: DSL search required to complete the ID
                check.

        Args:
            geo_shape_ids (dict): Dictionary of str - list, where each key is
                an entity type and each value is a list of IDs for that
                entity type.

        """
        checked_ids = {}

        for entity_type in INTERSECTION_PARAM_TYPES:
            if entity_type not in geo_shape_ids:
                continue

            entity_ids = list(geo_shape_ids[entity_type])
            search_class = entity_search_class(entity_type)
            search = search_class({
                'ids': entity_ids,
                'size': len(entity_ids),
                'fields': [N.ID]
            })

            yield from search.search_steps()

            checked_ids[entity_type] = [
                hit[N.ID] for hit in search.result.hits
            ]

        self._search = self._search.query(
            _build_geo_query(N.GEOM, ids=checked_ids))

    def _expand_geometry_query(self, search_class):
        """Completes a search that includes 'geometria' among its fields by
        creating elasticsearch_dsl.Search queries against the corresponding
        geometry indices.

        This method is needed because, for performance reasons, the entity
        indices do not store the original versions of the geometries (see the
        comment in es_config.py). The geometries must therefore be fetched
        from separate indices, using the IDs of the results found by the main
        search ('self._search').

        To fetch the geometries, an iterator of elasticsearch_dsl.Search
        objects is returned. This allows the method to be used from
        'search_steps', adding elasticsearch_dsl.Search instances that must
        be executed to complete the results of the ElasticsearchSearch
        instance.

        Args:
            search_class (type): Class used to create the search iterator.

        Yields:
            elasticsearch_dsl.Search: DSL search required to fetch the
                geometries.

        """
        ids = [hit['id'] for hit in self._result.hits]

        geom_search = search_class({
            'ids': ids,
            'fields': [N.ID, N.GEOM],
            'size': len(ids)
        })

        yield from geom_search.search_steps()

        original_hits = {hit[N.ID]: hit for hit in self._result.hits}

        for hit in geom_search.result.hits:
            # Add the geometry field to the original results
            original_hits[hit[N.ID]][N.GEOM] = hit[N.GEOM]

    @property
    def result(self):
        """Returns the result of the search, if it was executed.

        Raises:
            RuntimeError: If the search was not executed.

        Returns:
            ElasticsearchResult: Result of the search.

        """
        if self._result is None:
            raise RuntimeError('Search has not been executed yet')

        return self._result

    @staticmethod
    def run_searches(es, searches):
        """Executes a list of ElasticsearchSearch searches.

        To execute the searches, an iterator of elasticsearch_dsl.Search
        objects is obtained for each element of 'searches'. Using the
        iterators, lists of elasticsearch_dsl.Search objects are built and
        then executed with '_run_multisearch'. The responses are then sent
        back to each iterator, which may or may not produce a new
        elasticsearch_dsl.Search. The process repeats until every iterator
        has finished.

        This process makes it possible to:
            1) Run any kind of search through a single interface.
            2) Run searches that require different numbers of steps through a
               single interface.
            3) Use the MultiSearch functionality to send as few requests to
               Elasticsearch as possible.

        The result of each search can be accessed through its '.result'
        field.

        Args:
            es (Elasticsearch): Elasticsearch connection.
            searches (list): List of ElasticsearchSearch (or derived)
                searches. The list may be of any length, since its contents
                are split up by '_run_multisearch' to avoid overly large
                requests to Elasticsearch.

        """
        iterators = [search.search_steps() for search in searches]
        iteration_data = []

        for iterator in iterators:
            search = utils.step_iterator(iterator)
            if search:
                iteration_data.append((iterator, search))

        while iteration_data:
            responses = _run_multisearch(
                es, [search for _, search in iteration_data])

            iterators = (iterator for iterator, _ in iteration_data)
            iteration_data = []

            for iterator, response in zip(iterators, responses):
                search = utils.step_iterator(iterator, response)
                if search:
                    iteration_data.append((iterator, search))
class Query(): """ Base query class used to query elasticsearch """ filters = {} start_date = None end_date = None interval_ = "month" offset_ = None def __init__(self, index, esfilters={}, interval=None, offset=None): """ :param index: An Index object containing the connection details :param esfilters: TODO: this is still to be implemented :param interval: interval to use for timeseries data :param offset: TODO: this is still to be implemented """ self.index = index self.search = Search(using=self.index.es, index=self.index.index_name) self.parent_agg_counter = 0 if esfilters: self.filters.update(esfilters) # an ordered aggregation dict so that the nested aggregations can be made chainable self.aggregations = OrderedDict() self.child_agg_counter_dict = defaultdict( int) # to keep a track of nested child aggregations self.size = 10000 # temporary hack to get all the data self.precision_threshold = 3000 # accuracy that we want when counting the number of items if interval: self.interval_ = interval if offset: self.offset_ = offset def add_query(self, key_val={}): """ Add an es_dsl query object to the es_dsl Search object :param key_val: a key-value pair(dict) containing the query to be added to the search object :returns: self, which allows the method to be chainable with the other methods """ q = Q("match", **key_val) self.search = self.search.query(q) return self def add_inverse_query(self, key_val={}): """ Add an es_dsl inverse query object to the es_dsl Search object :param key_val: a key-value pair(dict) containing the query to be added to the search object :returns: self, which allows the method to be chainable with the other methods """ q = Q("match", **key_val) self.search = self.search.query(~q) return self def is_open(self): """ Add the {'state':'open'} query to the Search object :returns: self, which allows the method to be chainable with the other methods """ self.add_query({"state": "open"}) return self def is_closed(self): """ Add the {'state':'closed'} query to the Search object :returns: self, which allows the method to be chainable with the other methods """ self.add_query({"state": "closed"}) return self def get_sum(self, field=None): """ Create a sum aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("sum", field=field) self.aggregations['sum_' + field] = agg return self def get_average(self, field=None): """ Create an avg aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("avg", field=field) self.aggregations['avg_' + field] = agg return self def get_percentiles(self, field=None, percents=None): """ Create a percentile aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :param percents: the specific percentiles to be calculated default: [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0] :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") if not percents: percents = [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0] 
agg = A("percentiles", field=field, percents=percents) self.aggregations['percentiles_' + field] = agg return self def get_terms(self, field=None): """ Create a terms aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("terms", field=field, size=self.size, order={"_count": "desc"}) self.aggregations['terms_' + field] = agg return self def get_min(self, field=None): """ Create a min aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("min", field=field) self.aggregations['min_' + field] = agg return self def get_max(self, field=None): """ Create a max aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("max", field=field) self.aggregations['max_' + field] = agg return self def get_cardinality(self, field=None): """ Create a cardinality aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("cardinality", field=field, precision_threshold=self.precision_threshold) self.aggregations['cardinality_' + field] = agg return self def get_extended_stats(self, field=None): """ Create an extended_stats aggregation object and add it to the aggregation dict :param field: the field present in the index that is to be aggregated :returns: self, which allows the method to be chainable with the other methods """ if not field: raise AttributeError( "Please provide field to apply aggregation to!") agg = A("extended_stats", field=field) self.aggregations['extended_stats_' + field] = agg return self def add_custom_aggregation(self, agg, name=None): """ Takes in an es_dsl Aggregation object and adds it to the aggregation dict. 
Can be used to add custom aggregations such as moving averages :param agg: aggregation to be added to the es_dsl search object :param name: name of the aggregation object (optional) :returns: self, which allows the method to be chainable with the other methods """ agg_name = name if name else 'custom_agg' self.aggregations[agg_name] = agg return self def since(self, start, field=None): """ Add the start date to query data starting from that date sets the default start date for each query :param start: date to start looking at the fields (from date) :param field: specific field for the start date in range filter for the Search object :returns: self, which allows the method to be chainable with the other methods """ if not field: field = "grimoire_creation_date" self.start_date = start date_dict = {field: {"gte": "{}".format(self.start_date.isoformat())}} self.search = self.search.filter("range", **date_dict) return self def until(self, end, field=None): """ Add the end date to query data upto that date sets the default end date for each query :param end: date to stop looking at the fields (to date) :param field: specific field for the end date in range filter for the Search object :returns: self, which allows the method to be chainable with the other methods """ if not field: field = "grimoire_creation_date" self.end_date = end date_dict = {field: {"lte": "{}".format(self.end_date.isoformat())}} self.search = self.search.filter("range", **date_dict) return self def by_authors(self, field=None): """ Used to seggregate the data with respect to the users. This method pops the latest aggregation from the self.aggregations dict and adds it as a nested aggregation under itself :param field: the field to create the parent agg (optional) default: author_uuid :returns: self, which allows the method to be chainable with the other methods """ # Parent aggregation agg_field = field if field else "author_uuid" agg_key = "terms_" + agg_field if agg_key in self.aggregations.keys(): agg = self.aggregations[agg_key] else: agg = A("terms", field=agg_field, missing="others", size=self.size) child_agg_counter = self.child_agg_counter_dict[ agg_key] # 0 if not present because defaultdict child_name, child_agg = self.aggregations.popitem() # add child agg to parent agg agg.metric(child_agg_counter, child_agg) # insert this agg to the agg dict. This agg essentially replaces # the last agg that was in the agg dict self.aggregations[agg_key] = agg self.child_agg_counter_dict[agg_key] += 1 return self def by_organizations(self, field=None): """ Used to seggregate the data acording to organizations. 
This method pops the latest aggregation from the self.aggregations dict and adds it as a nested aggregation under itself :param field: the field to create the parent agg (optional) default: author_org_name :returns: self, which allows the method to be chainable with the other methods """ # this functions is currently only for issues and PRs agg_field = field if field else "author_org_name" agg_key = "terms_" + agg_field if agg_key in self.aggregations.keys(): agg = self.aggregations[agg_key] else: agg = A("terms", field=agg_field, missing="others", size=self.size) child_agg_counter = self.child_agg_counter_dict[ agg_key] # 0 if not present because defaultdict child_name, child_agg = self.aggregations.popitem() agg.metric(child_agg_counter, child_agg) self.aggregations[agg_key] = agg self.child_agg_counter_dict[agg_key] += 1 return self def by_period(self, field=None, period=None, timezone=None, start=None, end=None): """ Create a date histogram aggregation using the last added aggregation for the current object. Add this date_histogram aggregation into self.aggregations :param field: the index field to create the histogram from :param period: the interval which elasticsearch supports, ex: "month", "week" and such :param timezone: custom timezone :param start: custom start date for the date histogram, default: start date under range :param end: custom end date for the date histogram, default: end date under range :returns: self, which allows the method to be chainable with the other methods """ hist_period = period if period else self.interval_ time_zone = timezone if timezone else "UTC" start_ = start if start else self.start_date end_ = end if end else self.end_date bounds = self.get_bounds(start_, end_) date_field = field if field else "grimoire_creation_date" agg_key = "date_histogram_" + date_field if agg_key in self.aggregations.keys(): agg = self.aggregations[agg_key] else: agg = A("date_histogram", field=date_field, interval=hist_period, time_zone=time_zone, min_doc_count=0, **bounds) child_agg_counter = self.child_agg_counter_dict[agg_key] child_name, child_agg = self.aggregations.popitem() agg.metric(child_agg_counter, child_agg) self.aggregations[agg_key] = agg self.child_agg_counter_dict[agg_key] += 1 return self def get_bounds(self, start=None, end=None): """ Get bounds for the date_histogram method :param start: start date to set the extended_bounds min field :param end: end date to set the extended_bounds max field :returns bounds: a dictionary containing the min and max fields required to set the bounds in date_histogram aggregation """ bounds = {} if start or end: # Extend bounds so we have data until start and end start_ts = None end_ts = None if start: start = start.replace(microsecond=0) start_ts = start.replace(tzinfo=timezone.utc).timestamp() start_ts_ms = start_ts * 1000 # ES uses ms if end: end = end.replace(microsecond=0) end_ts = end.replace(tzinfo=timezone.utc).timestamp() end_ts_ms = end_ts * 1000 # ES uses ms bounds_data = {} if start: bounds_data["min"] = start_ts_ms if end: bounds_data["max"] = end_ts_ms bounds["extended_bounds"] = bounds_data return bounds def reset_aggregations(self): """ Remove all aggregations added to the search object """ temp_search = self.search.to_dict() if 'aggs' in temp_search.keys(): del temp_search['aggs'] self.search.from_dict(temp_search) self.parent_agg_counter = 0 self.child_agg_counter = 0 self.child_agg_counter_dict = defaultdict(int) def flush_aggregations(self): """ Remove all the aggregations from the self.aggregations 
dict """ self.aggregations = OrderedDict() def fetch_aggregation_results(self): """ Loops though the self.aggregations dict and adds them to the Search object in order in which they were created. Queries elasticsearch and returns a dict containing the results :returns: a dictionary containing the response from elasticsearch """ self.reset_aggregations() for key, val in self.aggregations.items(): self.search.aggs.bucket(self.parent_agg_counter, val) self.parent_agg_counter += 1 self.search = self.search.extra(size=0) response = self.search.execute() self.flush_aggregations() return response.to_dict() def fetch_results_from_source(self, *fields, dataframe=False): """ Get values for specific fields in the elasticsearch index, from source :param fields: a list of fields that have to be retrieved from the index :param dataframe: if true, will return the data in the form of a pandas.DataFrame :returns: a list of dicts(key_val pairs) containing the values for the applied fields if dataframe=True, will return the a dataframe containing the data in rows and the fields representing column names """ if not fields: raise AttributeError( "Please provide the fields to get from elasticsearch!") self.reset_aggregations() self.search = self.search.extra(_source=fields) self.search = self.search.extra(size=self.size) response = self.search.execute() hits = response.to_dict()['hits']['hits'] data = [item["_source"] for item in hits] if dataframe: df = pd.DataFrame.from_records(data) return df.fillna(0) return data def get_timeseries(self, child_agg_count=0, dataframe=False): """ Get time series data for the specified fields and period of analysis :param child_agg_count: the child aggregation count to be used default = 0 :param dataframe: if dataframe=True, return a pandas.DataFrame object :returns: dictionary containing "date", "value" and "unixtime" keys with lists as values containing data from each bucket in the aggregation """ res = self.fetch_aggregation_results() ts = {"date": [], "value": [], "unixtime": []} if 'buckets' not in res['aggregations'][str(self.parent_agg_counter - 1)]: raise RuntimeError( "Aggregation results have no buckets in time series results.") for bucket in res['aggregations'][str(self.parent_agg_counter - 1)]['buckets']: ts['date'].append(parser.parse(bucket['key_as_string']).date()) if str(child_agg_count) in bucket: # We have a subaggregation with the value # If it is percentiles we get the median if 'values' in bucket[str(child_agg_count)]: val = bucket[str(child_agg_count)]['values']['50.0'] if val == 'NaN': # ES returns NaN. Convert to None for matplotlib graph val = None ts['value'].append(val) else: ts['value'].append(bucket[str(child_agg_count)]['value']) else: ts['value'].append(bucket['doc_count']) # unixtime comes in ms from ElasticSearch ts['unixtime'].append(bucket['key'] / 1000) if dataframe: df = pd.DataFrame.from_records(ts, index="date") return df.fillna(0) return ts def get_aggs(self): """ Compute the values for single valued aggregations :returns: the single aggregation value """ res = self.fetch_aggregation_results() if 'aggregations' in res and 'values' in res['aggregations'][str( self.parent_agg_counter - 1)]: try: agg = res['aggregations'][str(self.parent_agg_counter - 1)]['values']["50.0"] if agg == 'NaN': # ES returns NaN. 
Convert to None for matplotlib graph agg = None except Exception as e: raise RuntimeError( "Multivalue aggregation result not supported") elif 'aggregations' in res and 'value' in res['aggregations'][str( self.parent_agg_counter - 1)]: agg = res['aggregations'][str(self.parent_agg_counter - 1)]['value'] else: agg = res['hits']['total'] return agg def get_list(self, dataframe=False): """ Compute the value for multi-valued aggregations :returns: a dict containing 'keys' and their corresponding 'values' """ res = self.fetch_aggregation_results() keys = [] values = [] for bucket in res['aggregations'][str(self.parent_agg_counter - 1)]['buckets']: keys.append(bucket['key']) values.append(bucket['doc_count']) result = {"keys": keys, "values": values} if dataframe: result = pd.DataFrame.from_records(result) return result
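# Because every builder method above returns `self`, a call site can compose
# filters, a metric and a date histogram in one chain. A hypothetical usage
# sketch (the `github_index` object and the field name are assumptions, not
# part of the class above):
from datetime import datetime

q = Query(github_index, interval="month")
timeseries = (
    q.is_closed()
     .since(datetime(2020, 1, 1))
     .until(datetime(2020, 12, 31))
     .get_cardinality("author_uuid")  # unique authors per bucket
     .by_period()                     # nest the cardinality under a date_histogram
     .get_timeseries(dataframe=True)  # -> pandas.DataFrame indexed by date
)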
def ESQuery( index=None, searchfields=None, returnfields=None, query=None, # search_term source_fields=None, aggregations=None, advanced=None, # used to add additional 'must' advanced search fields to search advanced_should=None, # used to add additional 'must' advanced search fields to search sort_order=[], nested=None, nested_must=None, nested_should=None, nested_range=None, preference="_primary_first", explain=None, offset=0, limit=15, fuzziness=1): s = None s = Search(using=client, index=index).source(returnfields) s = s.params(preference=preference) q = None queries = [] if nested is not None: print("Adding nested search") for nest in nested: queries.append( Q("nested", path=nest[0], query=Q("match", **{nest[1]: nest[2]}))) if nested_must is not None: for nest in nested_must: queries.append( Q("nested", path=nest[0], query=Q("match", **{nest[1]: nest[2]}))) s = s.query(Q('bool', must=queries))[offset:limit] queries = [] if nested_should is not None: for nest in nested_should: queries.append( Q("nested", path=nest[0], query=Q("match", **{nest[1]: nest[2]}))) s = s.query(Q('bool', should=queries))[offset:limit] if len(nested_should) > 1: s.query.minimum_should_match = 1 queries = [] if nested_range is not None: for nest in nested_range: queries.append( Q("nested", path=nest[0], filter=Q("range", **{nest[1]: { nest[2]: nest[3] }}))) s = s.query(Q('bool', must=queries)) queries = [] if advanced is not None: raw_queries = [] for advance in advanced: print("Adding match search on %s" % advance[0]) raw_queries.append(Q("match", **{advance[0]: advance[1]})) s = s.query(Q('bool', must=raw_queries)) if advanced_should is not None: for advance in advanced_should: queries.append(Q("match", **{advance[0]: advance[1]})) s = s.query(Q('bool', should=queries))[offset:limit] queries = [] if searchfields is not None: if query is not None and query != '': queries.append( Q("simple_query_string", query=query, default_operator="and", flags="PREFIX|PHRASE|NOT|AND|OR", fields=searchfields)) if len(queries) > 1: s = s.query(Q('bool', should=queries))[offset:limit] elif len(queries) > 0: s = s.query(Q('bool', must=queries[0]))[offset:limit] if source_fields is not None: s = s.extra(_source={'include': source_fields}) if len(sort_order) > 0: s = s.sort(sort_order[0]) if aggregations is not None: for agg in aggregations: a = A('terms', field=agg[1], size=10) s.aggs.bucket(agg[0], a) s = s.extra(explain=explain) results = s.execute() print("Query: ", json.dumps(s.to_dict())) print("Results: ", json.dumps(results.to_dict())) return results
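# A hedged example of calling the helper above for a nested-field search with
# a terms aggregation; the index, field names and values are placeholders:
results = ESQuery(
    index="documents",
    searchfields=["title", "body"],
    returnfields=["title", "authors"],
    query="climate adaptation",
    nested=[("authors", "authors.surname", "Smith")],
    aggregations=[("by_year", "year")],
    sort_order=["-year"],
    offset=0,
    limit=25,
)
for hit in results:
    print(hit.meta.score, hit.title)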
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int, Any]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages, the total count of results, and a query suggestion (if any).
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the
    # API with its corresponding field in Elasticsearch. "None" means that
    # the names are identical.
    filters = [
        ('extension', None),
        ('categories', None),
        ('aspect_ratio', None),
        ('size', None),
        ('source', 'provider'),
        ('license', 'license__keyword'),
        ('license_type', 'license__keyword')
    ]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)

    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})

    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)

    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects \
            .filter(filter_content=True) \
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'simple_query_string',
            query=query,
            fields=search_fields
        )
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'simple_query_string', query=creator, fields=['creator']
            )
            # Get suggestions for creator
            s = s.suggest(
                'get_suggestion', creator, term={'field': 'creator'}
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'simple_query_string', query=title, fields=['title']
            )
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'simple_query_string',
                fields=['tags.name'],
                query=tags
            )
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q(
                'rank_feature', field=factor, boost=boost_factor
            )
            queries.append(rank_feature_query)
        s = Search().query(
            Q(
                'bool',
                must=s.query,
                should=queries,
                minimum_should_match=1
            )
        )

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search objects are immutable; reassign so track_scores is not discarded.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )
    return results, page_count, result_count, suggestion
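# The popularity boost above wraps the existing query in a bool query whose
# `should` clauses are rank_feature queries, so popular documents score higher
# than unpopular ones that match equally well. The pattern in isolation, as a
# small sketch (index and field names are illustrative, and the fields are
# assumed to be mapped as `rank_feature` in the index):
from elasticsearch_dsl import Search, Q

base = Q("simple_query_string", query="sunset beach",
         fields=["title", "tags.name", "description"])
factors = ["comments", "views", "likes"]
boosts = [Q("rank_feature", field=f, boost=100 / len(factors)) for f in factors]
boosted = Search(index="image").query(Q("bool", must=base, should=boosts))
# Matching documents are returned either way; the optional rank_feature
# clauses only raise the score of the popular ones.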
def get(self, request): """GET handler.""" q = request.GET.get('query_string') offset = int(request.GET.get('offset', 0)) limit = int(request.GET.get('limit', 10)) if limit > 500: return HttpResponseBadRequest("limit must not exceed 500") type_filter = request.GET.get('type_filter', 'all') doc_type_map = { list(Index(settings.ES_INDEX_PREFIX.format('publications')).get_alias().keys())[0]: 'publication', list(Index(settings.ES_INDEX_PREFIX.format('publications-legacy')).get_alias().keys())[0]: 'publication', list(Index(settings.ES_INDEX_PREFIX.format('files')).get_alias().keys())[0]: 'file', list(Index(settings.ES_INDEX_PREFIX.format('cms')).get_alias().keys())[0]: 'modelresult' } public_files_query = CommunityDataSearchManager(request).construct_query() | PublishedDataSearchManager(request).construct_query() publications_query = PublicationsSiteSearchManager(request).construct_query() cms_query = es_query = CMSSearchManager(request).construct_query() if type_filter == 'public_files': es_query = Search().query(public_files_query) elif type_filter == 'published': es_query = Search().query(publications_query) elif type_filter == 'cms': es_query = Search().query(cms_query).highlight( 'body', fragment_size=100).highlight_options( pre_tags=["<b>"], post_tags=["</b>"], require_field_match=False) elif type_filter == 'all': es_query = Search().query(public_files_query | publications_query | cms_query).highlight( 'body', fragment_size=100).highlight_options( pre_tags=["<b>"], post_tags=["</b>"], require_field_match=False) es_query = es_query.extra(from_=offset, size=limit) try: res = es_query.execute() except (TransportError, ConnectionTimeout) as err: if getattr(err, 'status_code', 500) == 404: raise res = es_query.execute() out = {} hits = [] for r in res: d = r.to_dict() d["doc_type"] = doc_type_map[r.meta.index] if hasattr(r.meta, 'highlight'): highlight = r.meta.highlight.to_dict() d["highlight"] = highlight if r.meta.doc_type == 'publication' and hasattr(r, 'users'): users = r.users pi = r.project.value.pi pi_user = [x for x in users if x.username==pi][0] d["piLabel"] = "{}, {}".format(pi_user.last_name, pi_user.first_name) hits.append(d) out['hits'] = hits out['all_total'] = Search().query(public_files_query | publications_query | cms_query).count() out['public_files_total'] = Search().query(public_files_query).count() out['published_total'] = Search().query(publications_query).count() out['cms_total'] = Search().query(cms_query).count() print(out) return JsonResponse(out, safe=False)
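# The view above builds one Q object per content type and combines them with
# the `|` operator (a bool/should query under the hood), then calls `.count()`
# per type to fill the *_total fields. A minimal sketch of that pattern, with
# placeholder queries and assuming a default elasticsearch_dsl connection has
# already been configured:
from elasticsearch_dsl import Search, Q

files_q = Q("term", system="community")
pubs_q = Q("match", title="earthquake")

print("files:", Search().query(files_q).count())
print("pubs:", Search().query(pubs_q).count())
print("all:", Search().query(files_q | pubs_q).count())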