class MultiSearch(object):
    """Collect several queries and run them as one multi-search request.

    The wrapped ``BaseMultiSearch`` object lives in ``self._queries`` and is
    replaced by the value returned from its ``add`` every time a query is
    added.
    """

    def __init__(self, index=None, queries=None):
        """Create the wrapper.

        Args:
            index: Optional object exposing ``_meta.index``; when truthy,
                that value is passed to ``BaseMultiSearch`` as ``index``.
            queries: Optional iterable of queries added right away.
        """
        self.index = index
        self._queries = BaseMultiSearch(
            index=self.index._meta.index if index else None)
        for query in queries or []:
            self.add(query)

    def raw(self, raw_dict):
        """Return ``Search().raw(raw_dict)``; the batch itself is untouched."""
        return Search().raw(raw_dict)

    def filter(self, *args, **kw):
        """Return ``Search().filter(...)``; the batch itself is untouched."""
        return Search().filter(*args, **kw)

    def query(self, *args, **kw):
        """Return ``Search().query(...)``; the batch itself is untouched."""
        return Search().query(*args, **kw)

    def add(self, *queries):
        """Append every given query to the batch, in order."""
        for query in queries:
            # The wrapped object's ``add`` returns the updated batch, so it
            # has to be stored back.
            self._queries = self._queries.add(query)

    def execute(self):
        """Run the batch and return whatever the wrapped ``execute`` returns."""
        return self._queries.execute()

    def __iter__(self):
        """Execute the batch and iterate over its result."""
        return iter(self.execute())

    def __len__(self):
        """Return the number of queries currently in the batch."""
        try:
            return len(self._queries)
        except TypeError:
            # The wrapped multi-search may be iterable without defining
            # __len__ (as in elasticsearch-dsl); count its entries instead.
            return sum(1 for _ in self._queries)
def run_searches(es, index, searches):
    """Run a list of Elasticsearch searches.

    Internally a single MultiSearch request is used.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        index (str): Name of the index the queries should run against.
        searches (list): List of searches, of type Search.

    Raises:
        DataConnectionException: if an error occurred while running the
            searches.

    Returns:
        list: List of results; each result contains a list of 'hits'
            (the documents found).
    """
    ms = MultiSearch(index=index, using=es)
    for search in searches:
        ms = ms.add(search)

    try:
        responses = ms.execute(raise_on_error=True)
        return [[hit.to_dict() for hit in response.hits]
                for response in responses]
    except elasticsearch.ElasticsearchException as e:
        # Chain the original error so the root cause stays in the traceback.
        raise DataConnectionException() from e
def mcount_buckets(self, buckets):
    """Count documents per index and per value for each field in ``buckets``.

    One search per field is sent in a single multi-search over every index
    matching ``TMUtils.MAP_PREFIX + "*"``. Each search aggregates by
    ``_index`` and, inside that, by the field's values.

    Args:
        buckets: Re-iterable collection of field names to aggregate on.

    Returns:
        dict: ``{index name without the prefix: {field name: {value: count}}}``.
        Empty when ``buckets`` is empty.
    """
    if not buckets:
        # Nothing to aggregate; avoid sending an empty multi-search request.
        return dict()

    ms = MultiSearch(using=self.es)
    for bucket_name in buckets:
        search = Search(using=self.es,
                        index="{}*".format(TMUtils.MAP_PREFIX))
        # aggs.bucket() modifies the search in place.
        search.aggs.bucket(
            'indexes', 'terms', field='_index', size=999999,
        ).bucket(
            'values', 'terms', field=bucket_name, size=999999,
        )
        ms = ms.add(search)
    mres = ms.execute()

    # Escape the prefix so characters such as "." are matched literally.
    prefix_re = re.compile("^{}".format(re.escape(TMUtils.MAP_PREFIX)))

    lang2buckets = dict()
    for bucket_name, res in zip(buckets, mres):
        if not (hasattr(res, "aggregations")
                and 'indexes' in res.aggregations):
            continue
        for index_bucket in res.aggregations['indexes'].buckets:
            lang_pair = prefix_re.sub("", index_bucket.key)
            for value_bucket in index_bucket['values'].buckets:
                per_field = lang2buckets.setdefault(
                    lang_pair, dict()).setdefault(bucket_name, dict())
                per_field[value_bucket.key] = value_bucket.doc_count
    return lang2buckets
def _run_multisearch(es, searches):
    """Run a list of Elasticsearch searches through MultiSearch.

    The number of searches sent per request is configurable through the
    ES_MULTISEARCH_MAX_LEN variable.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        searches (list): List of elasticsearch_dsl.Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches.

    Returns:
        list: List with the response to each search.
    """
    batch_size = constants.ES_MULTISEARCH_MAX_LEN
    collected = []

    # Split the searches into several batches when necessary.
    for start in range(0, len(searches), batch_size):
        batch = MultiSearch(using=es)
        for search in searches[start:start + batch_size]:
            batch = batch.add(search)

        try:
            batch_responses = batch.execute(raise_on_error=True)
            collected.extend(batch_responses)
        except elasticsearch.ElasticsearchException as e:
            raise DataConnectionException() from e

    return collected
def mget(self, id_langs, return_multiple=False):
    """Fetch documents for several (source id, source lang, target lang) triples.

    One search per triple is built with ``self._create_search`` and all of
    them are sent as a single multi-search.

    Args:
        id_langs: Iterable of ``(source_id, source_lang, target_lang)``.
        return_multiple: When false, only the first hit of each response is
            kept; when true, every hit is appended.

    Returns:
        list: Hits (passed through ``self._swap`` when the search asked for
        it), with ``None`` for a response without hits or one that could not
        be read. Triples for which no search was built add nothing.
    """
    if not id_langs:
        return []

    msearch = MultiSearch(using=self.es)
    search_swap = []
    for source_id, source_lang, target_lang in id_langs:
        search, swap = self._create_search(source_id, source_lang,
                                           target_lang)
        if search:
            # Sort by update date so in case of multiple segments having the
            # same source, the latest one will be returned.
            search = search.sort('-update_date')
            msearch = msearch.add(search)
            # Kept in lockstep with the searches actually added, so the zip
            # below pairs each response with its own swap flag.
            search_swap.append(swap)

    if not search_swap:
        # No search could be built; do not send an empty multi-search.
        return []

    responses = msearch.execute()
    results = []
    for res, swap in zip(responses, search_swap):
        try:
            if 'hits' not in res or not res.hits.total:
                results.append(None)
                continue
            for ret_doc in res.hits:
                # Exchange source and target (if needed)
                if swap:
                    ret_doc = self._swap(ret_doc)
                results.append(ret_doc)
                if not return_multiple:
                    break
        except Exception:
            # Raised when the Response is in some invalid state (no hits,
            # hits are empty). KeyboardInterrupt/SystemExit now propagate.
            logging.warning("Invalid Response object: {}".format(
                res.to_dict()))
            results.append(None)
            continue
    return results
def mexist(self, src_lang, src_ids):
    """Check, for each source id, whether a matching document exists.

    Searches are built with ``self._create_search_mindexes`` against the
    neighbours of ``src_lang`` in ``self.lang_graph`` and sent in
    multi-search batches of ``MEXIST_BATCH_SIZE``.

    Args:
        src_lang: Source language; its graph neighbours are the targets.
        src_ids: Sliceable sequence of source ids.

    Returns:
        list: One entry per executed search. ``True``/``False`` says whether
        the response reported hits; ``None`` marks a response that could not
        be read. Ids for which no search was built add nothing.
    """
    if not src_ids:
        return []

    tgt_langs = list(self.lang_graph.neighbors(src_lang))
    MEXIST_BATCH_SIZE = 10
    results = []
    for i in range(0, len(src_ids), MEXIST_BATCH_SIZE):
        msearch = MultiSearch(using=self.es)
        added = 0
        for source_id in src_ids[i:i + MEXIST_BATCH_SIZE]:
            search = self._create_search_mindexes(source_id, src_lang,
                                                  tgt_langs)
            if search:
                msearch = msearch.add(search)
                added += 1

        if not added:
            # Nothing was built for this batch; skip the empty request.
            continue

        responses = msearch.execute()
        for res in responses:
            try:
                results.append(bool('hits' in res and res.hits.total))
            except Exception:
                # Raised when the Response is in some invalid state (no
                # hits, hits are empty). KeyboardInterrupt/SystemExit now
                # propagate.
                logging.warning("Invalid Response object: {}".format(
                    res.to_dict()))
                results.append(None)
    return results
def _fetch_word_freqs_per_day(
        self,
        dataset_widget: DatasetWidget,
) -> Tuple[Mapping[str, Sequence[int]], Sequence[int], int]:
    """Fetch per-day word frequencies with a single multi-search.

    One search is added for every date produced by
    ``date_range(self._min_date, self._max_date)``, each filtered to that
    one-day window.

    Returns:
        A tuple ``(word_freqs, num_docs, took_msecs)``: ``word_freqs`` maps
        a bucket key to a per-day list of document counts (zero where the
        key has no bucket), ``num_docs`` holds ``hits.total.value`` of every
        response, and ``took_msecs`` is the measured time of the
        multi-search call in milliseconds.
    """
    _LOGGER.debug("Fetching word frequencies per day.")

    helper = SearchHelper(dataset_widget.dataset.type)
    template = Search().extra(size=0, track_total_hits=True)
    template = dataset_widget.set_search(template)
    template = helper.add_agg_text_tokens_terms(
        template, size=self._top_n_words)

    multi = MultiSearch()
    for day in date_range(self._min_date, self._max_date):
        day_filter = helper.query_date_range(
            gte=day, lt=day + timedelta(days=1))
        multi = multi.add(template.filter(day_filter))

    started = time()
    responses = multi.execute()
    finished = time()
    took_msecs = int((finished - started) * 1000)

    # Words absent on a given day keep a zero in that day's slot.
    word_freqs = defaultdict(lambda: [0] * len(responses))
    num_docs = []
    for day_idx, response in enumerate(responses):
        num_docs.append(response.hits.total.value)
        for bucket in helper.read_agg_text_tokens_terms(response):
            word_freqs[bucket.key][day_idx] = bucket.doc_count

    return word_freqs, num_docs, took_msecs
def select_fields(all_fields, search, number_of_groups):
    '''
    Picks the field groups from the given Fields object that are most common
    across the resources, using the search to limit which records count.
    A group is only selected if the search finds at least one record with a
    value in it, or if the group is forced.

    :param all_fields: a Fields object
    :param search: an elasticsearch-dsl search object
    :param number_of_groups: the number of groups to select from the Fields
                             object and return
    :return: a list of groups, each group is a dict containing:
                - "group" - the group name
                - "count" - the number of resources its fields appear in
                - "records" - the number of records the group's fields
                              appear in
                - "fields" - the fields that make up the group along with
                             the resource ids they come from
                - "forced" - whether the field was forced into being
                             included, or whether it was included organically
    '''
    # hits themselves are not needed, only the totals
    search = search.extra(size=0)
    chosen = []

    # work through the (group, search) pairs one chunk at a time
    pairs = all_fields.get_searches(search)
    for chunk in chunk_iterator(pairs, chunk_size=number_of_groups):
        groups, group_searches = zip(*chunk)

        # one multisearch request per chunk
        multisearch = MultiSearch(using=common.ES_CLIENT)
        for group_search in group_searches:
            multisearch = multisearch.add(group_search)

        responses = multisearch.execute()
        for (group, count, fields), response in zip(groups, responses):
            if all_fields.is_forced(group) or response.hits.total > 0:
                # the group is forced or has values in the search result
                chosen.append(dict(group=group,
                                   count=count,
                                   records=response.hits.total,
                                   fields=fields,
                                   forced=all_fields.is_forced(group)))

        # stop asking for more chunks once enough groups were collected
        if len(chosen) >= number_of_groups:
            break

    def group_sorter(the_group):
        # forced groups sort first and keep the order in which they were
        # forced (constant 0, 0 key plus a stable sort); the remaining groups
        # are ordered by count and then by number of records
        if not the_group[u'forced']:
            return False, the_group[u'count'], the_group[u'records']
        return True, 0, 0

    # highest keys first
    return sorted(chosen, key=group_sorter, reverse=True)
def multi_search(searchs):
    """Run all given searches as one multi-search against "log-index".

    Uses the module-level ``conn`` connection and returns the result of
    ``execute()`` unchanged.
    """
    batch = MultiSearch(using=conn, index="log-index")
    for item in searchs:
        batch = batch.add(item)
    return batch.execute()
def get_multi_search(self):
    """Build the multi-search for this request.

    The search from ``self.get_search()`` is always included. When the
    aggregations parameter is present in ``self.args``, the batch is passed
    through ``self.add_terms_aggregations`` before being returned.
    """
    batch = MultiSearch()
    batch = batch.add(self.get_search())

    if self.args.get(constants.PARAM_AGGREGATIONS) is None:
        return batch
    return self.add_terms_aggregations(batch)
def build(self, q=None, **options):
    """
    Build a query according to q and options.
    This is the public method called by API handlers.

    Regarding scopes:
        scopes: [str] nonempty, match query.
        scopes: NoneType, or [], no scope, so query string query.

    Additionally support these options:
        explain: include es scoring information
        userquery: customized function to interpret q

    * additional keywords are passed through as es keywords
        for example: 'explain', 'version' ...

    * multi-search is supported when q is a list. all queries
        are built individually and then sent in one request.
    """
    options = dotdict(options)

    # a scroll id bypasses every query building stage
    if options.scroll_id:
        return ESScrollID(options.scroll_id)

    # fetch_all conflicts with sorting and sizing, so drop both
    if options.fetch_all:
        for conflicting in ('sort', 'size'):
            options.pop(conflicting, None)

    try:
        # single q (str, int ...) vs list of q(s); 'val' vs 'key:val'
        # dispatching happens inside _build_one
        if not isinstance(q, list):
            search = self._build_one(q, options)
        else:
            search = MultiSearch()
            for single_q in q:
                search = search.add(self._build_one(single_q, options))
    except IllegalOperation as exc:
        # ex. sorting by -_score
        raise ValueError(str(exc))

    if options.get('rawquery'):
        raise RawQueryInterrupt(search.to_dict())

    return search
def get_usernames_for_crawl():
    """Yield usernames from the 'populars' index that are due for a crawl.

    Two searches are combined in one multi-search: documents that have no
    ``last_update`` field, and documents whose ``last_update`` is
    ``now-2d`` or older. Each search is sliced to its own total count.

    Yields:
        The ``username`` of every hit, never-updated documents first.
    """
    index_name = 'populars'
    ms = MultiSearch(index=index_name)

    # The searches carry the index themselves so that count() looks at the
    # same documents the multi-search will return; an index-less Search
    # would take its count from every index.
    q = Q({"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search(index=index_name).query(q)
    total = never_updated.count()
    never_updated = never_updated[0:total]

    old_updated = Search(index=index_name).query(
        'range', last_update={"lte": "now-2d"})
    total = old_updated.count()
    old_updated = old_updated[0:total]

    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    responses = ms.execute()

    for res in responses:
        for hit in res:
            yield hit.username
def test_multi_search(data_client):
    """Two searches in one MultiSearch return two responses, in order."""
    s1 = Repository.search()
    s2 = Search(index="flat-git")

    ms = MultiSearch().add(s1).add(s2)
    repo_response, commit_response = ms.execute()

    # first response: the single Repository document
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is s1

    # second response: everything in the flat-git index
    assert commit_response.hits.total.value == 52
    assert commit_response._search is s2
def test_multi_search(data_client):
    """Two searches in one MultiSearch return two responses, in order."""
    s1 = Repository.search()
    s2 = Search(index='flat-git')

    ms = MultiSearch().add(s1).add(s2)
    repo_response, commit_response = ms.execute()

    # first response: the single Repository document
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is s1

    # second response: everything in the flat-git index
    assert commit_response.hits.total == 52
    assert commit_response._search is s2
def test_multi_search(data_client):
    """Two searches in a MultiSearch on 'git' return two responses, in order."""
    s1 = Repository.search()
    s2 = Search(doc_type='commits')

    ms = MultiSearch(index='git').add(s1).add(s2)
    repo_response, commit_response = ms.execute()

    # first response: the single Repository document
    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response.search is s1

    # second response: all commits
    assert commit_response.hits.total == 52
    assert commit_response.search is s2
def multisearch(*models, **params):
    """Search several models with the same parameters in one request.

    A query is built per model with ``search_for`` and its underlying
    ``_s`` search is added to a single multi-search.

    Returns:
        list: One ``SearchResult`` per model, in the order given.
    """
    queries = [search_for(model, **params) for model in models]

    ms = MultiSearch(using=es.client, index=es.index_name)
    for query in queries:
        ms = ms.add(query._s)
    responses = ms.execute()

    results = []
    for query, response in zip(queries, responses):
        # _d_ is the only way to reach the raw data, which lets the response
        # be rewrapped in a FacetedSearch, because the default multisearch
        # loses facets
        results.append(SearchResult(query, response._d_))
    return results
def __init__(self, index=None, queries=None):
    """Create the wrapped multi-search and add any initial queries.

    Args:
        index: Optional object exposing ``_meta.index``; when truthy, that
            value is passed to ``BaseMultiSearch`` as ``index``.
        queries: Optional iterable of queries passed to ``self.add``.
    """
    self.index = index

    index_name = self.index._meta.index if index else None
    self._queries = BaseMultiSearch(index=index_name)

    initial_queries = queries or []
    for query in initial_queries:
        self.add(query)
def get(self, request, *args, **kwargs):
    """Search categories and restaurants for the ``q`` request parameter.

    With a query, categories are matched on ``label`` and restaurants on
    ``name`` or on their nested ``categories.label``, limited to the
    geo-distance radius around the user's coordinates. Both searches run as
    one multi-search and their hits are concatenated. Without a query, the
    first ten categories are returned instead.
    """
    query = request.GET.get('q')
    coords = [
        request.GET.get(param)
        for param in ('latitude', 'longitude', 'radius')
    ]
    latitude, longitude, radius = get_user_coordinates(coords, request)
    ms = MultiSearch(index=['restaurants', 'categories'])

    if not query:
        # no search term: just list the first ten categories
        cs = CategoryDocument.search().source([])[0:10]
        hits = cs.execute()['hits']['hits']
        return Response([hit.to_dict() for hit in hits])

    cs = CategoryDocument.search().query(
        "query_string", query=query, default_field="label")

    by_name = Q('query_string', query=query, default_field='name')
    by_category = Q(
        'nested',
        path='categories',
        query=Q('query_string', query=query,
                default_field='categories.label'))
    nearby = RestaurantDocument.search().filter(
        'geo_distance',
        distance='%smi' % radius,
        location={"lat": latitude, "lon": longitude})
    rs = nearby.query(by_name | by_category)

    ms = ms.add(cs).add(rs)

    aggregate = []
    for response in ms.execute():
        aggregate.extend(hit.to_dict() for hit in response['hits']['hits'])
    return Response(aggregate)
def query(self, queries, size, record_fnum):
    """Fetch up to ``size`` records for every (userid, record) query.

    Args:
        queries: Iterable of pairs; item 0 is matched against ``userid``
            and item 1 against ``record``.
        size: Number of rows returned for each query.
        record_fnum: Length of the all-zero rows used as padding.

    Returns:
        list: One list per query. Each hit becomes
        ``[userid] + record fields`` as ints (``record`` is split on
        commas); lists with fewer than ``size`` hits are padded with zero
        rows.
    """
    ms = MultiSearch(using=self.es, index=self.index_name)
    for q in queries:
        s = Search().query("match", userid=q[0]).query(
            "match", record=q[1])[:size]
        ms = ms.add(s)
    responses = ms.execute()

    res_batch = []
    for response in responses:
        res = [
            [int(hit.userid)] + [int(v) for v in hit.record.split(',')]
            for hit in response
        ]
        # Build each padding row separately: multiplying a one-element list
        # would make every padding row the same list object, so editing one
        # row would silently edit all of them.
        res.extend(
            np.zeros([record_fnum, ]).astype(np.int32).tolist()
            for _ in range(size - len(res))
        )
        res_batch.append(res)
    return res_batch
def get_queryset(self, queryset, data):
    """Build a multi-search with one query per requested model.

    Models come from the comma separated ``data['models']`` or default to
    ``self._supported_models``. For completion models (when the feature
    flag is on) a suggest request is executed first and the query selects
    the suggested ids; otherwise the phrase is matched against ``title``
    and ``notes``. Every query is limited to ``per_model`` results.

    Returns:
        The unexecuted MultiSearch.
    """
    phrase = data.get('q')
    if 'models' in data:
        models = data['models'].split(',')
    else:
        models = self._supported_models

    op, suffix = get_advanced_options(data.get('advanced'))
    lang = get_language()
    per_model = data.get('per_model', 1)

    ms = MultiSearch(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
    for model in models:
        use_completion = (is_enabled('S39_filter_by_geodata.be')
                          and model in self._completion_models)
        if use_completion:
            completion = {
                'field': f'title.{lang}.suggest',
                'size': per_model,
            }
            sug_query = Search(index=f'{model}s').suggest(
                'title', phrase, completion=completion)
            suggestions = sug_query.execute().suggest['title'][0]
            ids = [sug['_id'] for sug in suggestions['options']]

            query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
            query = query.filter('term', model=model)
            query = query.query('ids', values=ids)
        else:
            query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
            query = query.filter('term', model=model)
            should = [
                nested_query_with_advanced_opts(
                    phrase, field, lang, op, suffix)
                for field in ('title', 'notes')
            ]
            query = query.query('bool', should=should)

        ms = ms.add(query.extra(size=per_model))

    return ms
def build(self, q, **options):
    """Build a search for ``q``, with special handling for graph queries.

    NOTE: GRAPH QUERY CUSTOMIZATION

    A single ``GraphQuery`` is built with ``build_graph_query``; a
    ``GraphQueries`` collection becomes a MultiSearch with one graph query
    per item; anything else is delegated to the parent ``build``.
    """
    # ONE
    if isinstance(q, GraphQuery):
        return self.build_graph_query(q, **options)

    # MULTI
    if isinstance(q, GraphQueries):
        search = MultiSearch()
        for graph_q in q:
            search = search.add(self.build_graph_query(graph_q, **options))
        return search

    # NOT GRAPH
    return super().build(q, **options)
class Getdata:
    # NOTE: everything below runs when the class body is evaluated, so
    # defining this class already executes a search and a count request.

    # connection shared by the search objects below
    es = Elasticsearch(hosts='http://124.49.54.38:19200/')

    # single search on the daily index, 30 second request timeout
    search = Search(using=es, index='test-2018.09.11')
    search = search.params(request_timeout=30)

    # multi-search on the same index with the same timeout
    multi = MultiSearch(using=es, index='test-2018.09.11')
    multi = multi.params(request_timeout=30)

    # response of the search and its document count
    res = search.execute()
    total = search.count()
def execute_searches(self):
    """Run the query of every added series, and initialise the ``data`` and
    ``count`` attributes from the responses.

    Raises:
        QueryError: if no series has been added.
    """
    if not self.series:
        raise QueryError(strings.EMPTY_QUERY_ERROR)

    batch = MultiSearch(index=self.index,
                        doc_type=settings.TS_DOC_TYPE,
                        using=self.elastic)

    for serie in self.series:
        serie.add_collapse(self.args[constants.PARAM_PERIODICITY])
        batch = batch.add(serie.search)

    responses = batch.execute()

    self.data = ResponseFormatter(
        self.series, responses, self.args).format_response()
    # the largest total among all responses
    self.count = max(response.hits.total for response in responses)
def execute_searches(self):
    """Run the query of every added series and build ``data`` and ``count``
    from the responses.

    Returns:
        dict: ``{'data': formatted response, 'count': largest hits total}``.
    """
    batch = MultiSearch(index=self.index, doc_type=settings.TS_DOC_TYPE)
    for serie in self.series:
        batch = batch.add(serie.search)

    responses = batch.execute()

    formatter = ResponseFormatter(
        self.series,
        responses,
        self.args[constants.PARAM_SORT],
        self.args[constants.PARAM_PERIODICITY],
    )
    data = formatter.format_response()
    count = max(response.hits.total for response in responses)

    return {'data': data, 'count': count}
def calculate_field_counts(request, es_client):
    '''
    Works out, for the search in the given download request, how many
    records have a value for each field of each resource.

    :param request: the DownloadRequest object
    :param es_client: the elasticsearch client to use
    :return: a dict of resource ids -> fields -> counts
    '''
    field_counts = defaultdict(dict)

    for resource_id, version in request.resource_ids_and_versions.items():
        index_name = prefix_resource(resource_id)

        # the mapping lists the fields of every version of the index, so it
        # only tells us which fields to ask about; the searches below decide
        # which of them are present at this version
        mapping = es_client.indices.get_mapping(index_name)[index_name]

        # one multisearch per resource, holding one search per field
        multisearch = MultiSearch(using=es_client, index=index_name)
        base_search = Search.from_dict(request.search)
        base_search = base_search.index(index_name).using(es_client)
        base_search = base_search.extra(size=0)
        base_search = base_search.filter(create_version_query(version))

        # field names, nested ones in dot notation
        fields = []
        for parts, _config in iter_data_fields(mapping):
            fields.append(u'.'.join(parts))

        # each search counts the documents with a value for one field at
        # the right version
        for field in fields:
            has_value = base_search.filter(u'exists',
                                           field=prefix_field(field))
            multisearch = multisearch.add(has_value)

        responses = multisearch.execute()
        for field, response in zip(fields, responses):
            field_counts[resource_id][field] = response.hits.total

    return field_counts
def test_multi_missing(data_client):
    """A missing index raises by default and yields None when errors are
    not raised."""
    s1 = Repository.search()
    s2 = Search(index='flat-git')
    s3 = Search(index='does_not_exist')

    ms = MultiSearch().add(s1).add(s2).add(s3)

    # default behaviour: the failing search raises
    with raises(TransportError):
        ms.execute()

    repo_response, commit_response, missing_response = ms.execute(
        raise_on_error=False)

    assert len(repo_response) == 1
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is s1

    assert commit_response.hits.total == 52
    assert commit_response._search is s2

    # the search on the missing index has no response
    assert missing_response is None
def find_searched_resources(search, resource_ids):
    '''
    Returns the resources, out of the given resource ids, that actually
    have records in the results of the given search.

    :param search: an elasticsearch-dsl object
    :param resource_ids: a list of resource ids
    :return: a list of resource ids
    '''
    # aggs are modified in place rather than on a clone, so work on a copy
    # of the caller's search :(
    search_copy = copy(search)
    indexes = [prefix_resource(resource_id) for resource_id in resource_ids]
    search_copy = search_copy.index(indexes)
    search_copy.aggs.bucket(u'indexes', u'terms', field=u'_index')

    multisearch = MultiSearch(using=common.ES_CLIENT)
    multisearch = multisearch.add(search_copy)
    responses = multisearch.execute()

    # only one search was added, so only the first response matters
    result = next(iter(responses))
    buckets = result.aggs.to_dict()[u'indexes'][u'buckets']

    found = []
    for bucket in buckets:
        if bucket[u'doc_count'] > 0:
            found.append(trim_index_name(bucket[u'key']))
    return found
def simple_search_public_data(query_text):
    """Search publicly accessible experiments, datasets and datafiles.

    Three searches run as one multi-search: experiments by ``title``,
    datasets by ``description`` and datafiles by ``filename``, each
    restricted to ``public_access`` 100.

    Returns:
        dict: ``{"experiments": [...], "datasets": [...], "datafiles": [...]}``
        holding the hits as dicts, grouped by the index they came from.
    """
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])

    # experiments: title match, public only
    query_exp = Q("match", title=query_text) & Q("term", public_access=100)
    exp_search = Search(index='experiments').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(exp_search.query(query_exp))

    # datasets: description match, public parent experiment (nested)
    query_dataset = Q("match", description=query_text)
    query_dataset_oacl = Q("term", **{'experiments.public_access': 100})
    dataset_search = Search(index='dataset').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    dataset_search = dataset_search.query(query_dataset)
    ms = ms.add(dataset_search.query(
        'nested', path='experiments', query=query_dataset_oacl))

    # datafiles: filename match, public parent experiment
    query_datafile = (Q("match", filename=query_text)
                      & Q("term", experiments__public_access=100))
    datafile_search = Search(index='datafile').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(datafile_search.query(query_datafile))

    # index name -> key in the result dict; other indexes are ignored
    key_for_index = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    for item in ms.execute():
        for hit in item.hits.hits:
            key = key_for_index.get(hit["_index"])
            if key is not None:
                result_dict[key].append(hit.to_dict())
    return result_dict
def get_object_list(self, request):
    """Search experiments, datasets and datafiles visible to the user.

    Anonymous users get the result of ``simple_search_public_data``. For
    authenticated users each search is restricted to objects whose ACL
    entity id is the user's id or one of the user's group ids, or that have
    ``public_access`` 100.

    Returns:
        list: A single ``SearchObject`` whose ``hits`` is a dict with the
        keys "experiments", "datasets" and "datafiles".
    """
    user = request.user
    query_text = request.GET.get('query', None)

    if not user.is_authenticated:
        public_hits = simple_search_public_data(query_text)
        return [SearchObject(id=1, hits=public_hits)]

    groups = user.groups.all()
    ms = MultiSearch(index=['experiments', 'dataset', 'datafile'])

    # experiments: title match AND (own / public / any group)
    query_exp_oacl = (Q("term", objectacls__entityId=user.id)
                      | Q("term", public_access=100))
    for group in groups:
        query_exp_oacl |= Q("term", objectacls__entityId=group.id)
    query_exp = Q("match", title=query_text) & query_exp_oacl
    exp_search = Search(index='experiments').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(exp_search.query(query_exp))

    # datasets: description match, ACL applied as a nested query on the
    # parent experiments
    query_dataset = Q("match", description=query_text)
    query_dataset_oacl = (
        Q("term", **{'experiments.objectacls.entityId': user.id})
        | Q("term", **{'experiments.public_access': 100}))
    for group in groups:
        query_dataset_oacl |= Q(
            "term", **{'experiments.objectacls.entityId': group.id})
    dataset_search = Search(index='dataset').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    dataset_search = dataset_search.query(query_dataset)
    ms = ms.add(dataset_search.query(
        'nested', path='experiments', query=query_dataset_oacl))

    # datafiles: filename match AND (own / public / any group)
    query_datafile_oacl = (
        Q("term", experiments__objectacls__entityId=user.id)
        | Q("term", experiments__public_access=100))
    for group in groups:
        query_datafile_oacl |= Q(
            "term", experiments__objectacls__entityId=group.id)
    query_datafile = Q("match", filename=query_text) & query_datafile_oacl
    datafile_search = Search(index='datafile').extra(
        size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
    ms = ms.add(datafile_search.query(query_datafile))

    results = ms.execute()

    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    # index name -> key in the result dict; other indexes are ignored
    key_for_index = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    for item in results:
        for hit in item.hits.hits:
            key = key_for_index.get(hit["_index"])
            if key is not None:
                result_dict[key].append(hit.to_dict())

    return [SearchObject(id=1, hits=result_dict)]
def execute_queries(self, queries: Dict[Resource, Q],
                    page_index: int,
                    results_per_page: int) -> List[Response]:
    """Run one paginated search per resource in a single multi-search.

    Args:
        queries: Query to run for each resource; the index is resolved with
            ``self.get_index_for_resource``.
        page_index: Zero-based page number.
        results_per_page: Number of results in a page.

    Returns:
        The multi-search responses, or an empty list if executing the
        request raised (the error is logged, not re-raised).
    """
    # pagination window, identical for every resource
    start_from = page_index * results_per_page
    end = results_per_page * (page_index + 1)

    multisearch = MultiSearch(using=self.elasticsearch)
    for resource, query_for_resource in queries.items():
        index = self.get_index_for_resource(resource_type=resource)
        search = Search(index=index).query(query_for_resource)
        LOGGER.info(search.to_dict())
        multisearch = multisearch.add(search[start_from:end])

    try:
        return multisearch.execute()
    except Exception as e:
        LOGGER.error(f'Failed to execute ES search queries. {e}')
        return []
def run_multiple_filters():
    """Run the ask and bid price filters on "book" every five seconds, forever.

    Each round sends both filter queries as one multi-search, prints the
    length and total of every response, then shows the first results of
    each filter.
    """
    while True:
        index = "book"

        ask_price_filter = AskPriceFilter(index)
        search_ask = ask_price_filter.main_query(
            gt_price=50, lt_price=52, from_range=10, to_range=20)

        bid_price_filter = BidPriceFilter(index)
        search_bid = bid_price_filter.main_query(
            gt_price=50, lt_price=51, to_range=15)

        ms = MultiSearch(index=index).add(search_ask).add(search_bid)

        # execute() returns a list of Response objects
        for resp in ms.execute():
            print(len(resp))
            print(resp.hits.total.value)

        ask_price_filter.show_result(from_range=0, to_range=5)
        bid_price_filter.show_result(from_range=0, to_range=5)

        time.sleep(5)
def __init__(self, es, index, limit=10, q=None, filter=None):
    """Set up the multi-search and build the queries and filters.

    Args:
        es: Elasticsearch connection used by the multi-search.
        index: Index the multi-search runs against.
        limit: Stored as ``self.limit``.
        q: A single query string or a collection of them; a string is
            wrapped in a list before being passed to ``self._build``.
        filter: A single filter dict or a collection of them; a dict is
            wrapped in a list before being passed to ``self._filter``.
    """
    self.search = []  # Search(using=es, index=index)
    self.msearch = MultiSearch(using=es, index=index)
    self.queries = []
    self.num_segs = 0
    self.limit = limit

    # Build queries
    self._build([q] if isinstance(q, str) else q)

    # Build filter(s)
    self._filter([filter] if isinstance(filter, dict) else filter, es, index)
def run(self, network, channels, query, author=None, date_range=None):
    """Search log lines and return each match with its surrounding context.

    Lines of type 'normal' or 'action' on the given network and channels
    are matched against ``query`` within ``date_range``, newest date first.
    For every match a second search fetches the lines within
    ``config.SEARCH_CONTEXT`` line numbers of it; all context searches run
    as one multi-search.

    Args:
        network: Network to search in.
        channels: Channels to search in.
        query: Text matched against the ``text`` field.
        author: Accepted but not used by this method.
        date_range: ``(begin, end)`` pair of dates; required.

    Returns:
        list: Lists of ``Hit`` objects, consecutive hits sharing a date
        grouped together. Empty when nothing matched.
    """
    # We don't support non-ajax, so will always have date range
    assert date_range
    date_begin, date_end = date_range

    result = Search(
        using=self.es, index='moffle',
    ).query(
        "match", text=query,
    ).query(
        "range", date={
            'gt': date_begin.strftime('%Y%m%d'),
            'lte': date_end.strftime('%Y%m%d'),
        },
    ).filter(
        "terms", line_type=['normal', 'action'],
    ).filter(
        "term", network=network,
    ).filter(
        "terms", channel=channels,
    ).sort(
        "-date",
    )[:10000].execute()

    matches = list(result)
    if not matches:
        # Without matches there would be no context searches, and an empty
        # multi-search request is rejected by Elasticsearch.
        return []

    # TODO: interval merging
    ctx_search = MultiSearch(using=self.es, index='moffle')
    for hit in matches:
        # Fetch context
        ctx_search = ctx_search.add(Search(
            using=self.es, index='moffle',
        ).query(
            "range", line_no={
                "gte": hit.line_no - config.SEARCH_CONTEXT,
                "lte": hit.line_no + config.SEARCH_CONTEXT,
            },
        ).filter(
            "term", network=hit.network,
        ).filter(
            "term", channel=hit.channel,
        ).filter(
            "term", date=hit.date,
        ).sort(
            "line_no",
        ))
    ctx_results = ctx_search.execute()

    hits = []
    for match, ctx_result in zip(matches, ctx_results):
        lines = [
            self._format_line(
                ctx_hit, is_hit=(match.line_no == ctx_hit.line_no))
            for ctx_hit in ctx_result
        ]
        hits.append(Hit(
            channel=match.channel,
            date=match.date,
            begin=lines[0].line_no,
            lines=lines,
        ))

    # groupby only merges neighbours; hits come back sorted by date
    return [list(group)
            for _, group in groupby(hits, key=lambda hit: hit.date)]
def batch_request(cls, names):
    """
    Map all name fragments in the array to name hashes.

    Takes an array of arrays (names are tokenized) and returns, per name
    and per fragment, the list of matches found in ES. A fragment without
    matches gets a single "no-match" entry whose lemma is the SHA-1 of the
    salted fragment.
    """
    # TODO: THROW IT AWAY AND REPLACE WITH DAWG

    label_for = {
        "lemma-firstname": "firstname",
        "lemma-patronymic": "patronymic",
        "lemma-lastname": "lastname",
        "lemma-firstname-typo": "firstname",
        "lemma-patronymic-typo": "patronymic",
        "lemma-lastname-typo": "lastname"
    }

    def describe_hit(hit):
        # exactly one label besides the generic "lemma" one is expected
        labels = list(set(hit.lemma_labels) - {"lemma"})
        assert len(labels) == 1
        label = label_for[labels[0]]
        return {
            "term": hit.term,
            "lemma": hit.lemma,
            "label": label
        }

    def describe_fragment(chunk, resp):
        if resp:
            return list(map(describe_hit, resp))
        salted = (chunk + "thisissalt").encode('utf-8')
        return [{
            "lemma": sha1(salted).hexdigest(),
            "label": "no-match",
            "term": chunk
        }]

    # one term search per fragment, flattened in input order
    # TODO: case for initials
    qs = MultiSearch(index=cls._doc_type.index)
    for name in names:
        for chunk in name:
            qs = qs.add(cls.search().filter("term", term=chunk))
    response = qs.execute()

    # cut the flat response list back into one slice per name
    results = []
    pos = 0
    for name in names:
        span = len(name)
        hashes = response[pos:pos + span]
        results.append([
            describe_fragment(chunk, resp)
            for chunk, resp in zip(name, hashes)
        ])
        pos += span
    return results
def _execute_multi_search(self, page, num_results):
    """Run the paginated searches for every index and merge the results.

    Per-index progress is tracked in
    ``self.previous_search_results['loaded_variant_counts']`` as
    ``{'loaded': ..., 'total': ...}`` entries. Indices whose loaded count
    has reached their total are skipped; the remaining ones contribute the
    searches returned by ``self._get_paginated_searches``.

    Args:
        page: Page number; ``(page - 1) * num_results`` records are treated
            as belonging to earlier pages.
        num_results: Number of results per page.

    Returns:
        The value of ``self._process_compound_hets(...)`` when there are
        compound het results or previously grouped results; otherwise the
        first ``num_results`` deduplicated variant results.
    """
    indices = self.samples_by_family_index.keys()

    if not self.previous_search_results.get('loaded_variant_counts'):
        self.previous_search_results['loaded_variant_counts'] = {}

    ms = MultiSearch()
    for index_name in indices:
        start_index = 0
        if self.previous_search_results['loaded_variant_counts'].get(index_name):
            index_total = self.previous_search_results['loaded_variant_counts'][index_name]['total']
            start_index = self.previous_search_results['loaded_variant_counts'][index_name]['loaded']
            # everything from this index has been loaded already
            if start_index >= index_total:
                continue
        else:
            # first time this index is seen: start its counters at zero
            self.previous_search_results['loaded_variant_counts'][index_name] = {'loaded': 0, 'total': 0}

        searches = self._get_paginated_searches(index_name, page, num_results, start_index=start_index)
        # NOTE(review): presumably sets the index for the searches added
        # next -- confirm how repeated index() calls combine on a MultiSearch
        ms = ms.index(index_name)
        for search in searches:
            ms = ms.add(search)

    responses = self._execute_search(ms)

    new_results = []
    # when the key exists this is the stored object itself, so the += below
    # (assuming it is a list) also extends the stored results
    compound_het_results = self.previous_search_results.get('compound_het_results', [])
    for response in responses:
        response_hits, response_total, is_compound_het = self._parse_response(response)
        if not response_total:
            continue

        # the index name is read from the first hit of the response
        index_name = response.hits[0].meta.index
        if is_compound_het:
            compound_het_results += response_hits
            # compound het totals are stored under a separate key per index
            self.previous_search_results['loaded_variant_counts']['{}_compound_het'.format(index_name)] = {'total': response_total}
        else:
            new_results += response_hits
            self.previous_search_results['loaded_variant_counts'][index_name]['total'] = response_total
            self.previous_search_results['loaded_variant_counts'][index_name]['loaded'] += len(response_hits)

    # sum of the totals of every entry, compound het entries included
    self.previous_search_results['total_results'] = sum(counts['total'] for counts in self.previous_search_results['loaded_variant_counts'].values())

    # combine new results with unsorted previously loaded results to correctly sort/paginate
    all_loaded_results = self.previous_search_results.get('all_results', [])
    previous_page_record_count = (page - 1) * num_results
    if len(all_loaded_results) >= previous_page_record_count:
        # earlier pages stay as they are; anything loaded beyond them is
        # sorted again together with the new results
        loaded_results = all_loaded_results[:previous_page_record_count]
        new_results += all_loaded_results[previous_page_record_count:]
    else:
        loaded_results = []
        new_results += self.previous_search_results.get('variant_results', [])

    new_results = sorted(new_results, key=lambda variant: variant['_sort'])
    variant_results = self._deduplicate_results(new_results)

    if compound_het_results or self.previous_search_results.get('grouped_results'):
        if compound_het_results:
            compound_het_results = self._deduplicate_compound_het_results(compound_het_results)
        return self._process_compound_hets(compound_het_results, variant_results, num_results)
    else:
        # 'all_results' is only updated on this branch
        self.previous_search_results['all_results'] = loaded_results + variant_results
        return variant_results[:num_results]
#!/usr/bin/env python
"""Run two "match" searches on the message field in one multi-search and
print the host and message of every hit."""
from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

client = Elasticsearch(['192.168.33.108:9200', '192.168.33.109:9200'])

# multi search "hello" on message field.
ms = MultiSearch(using=client, index='logstash-*')
for text in ("hello", "hello7"):
    ms = ms.add(Search().query("match", message=text))

responses = ms.execute()

for response in responses:
    for r in response:
        print(r['host'], r['message'])
class MultiSearchConductor:
    """Run dashboard aggregations for one or more indices via multi-search.

    For every index, one aggregation search per supported field plus one
    search for the ``texta_facts`` field is batched into a MultiSearch and
    executed. ``field_counts`` collects, per field path, how many documents
    have a value for it.
    """

    # field type -> aggregations to attach, as
    # (aggregation name template, aggregation type, extra keyword arguments).
    # Do not play around with the #, they exist to avoid naming conflicts as
    # awkward as they may be.
    # TODO Find a better solution for this.
    _FIELD_AGGREGATIONS = {
        "keyword": (
            ("sterms#{0}#keyword_terms", 'terms', {}),
        ),
        "date": (
            ("date_histogram#{0}_month#date_month", 'date_histogram',
             {'interval': 'month'}),
            ("date_histogram#{0}_year#date_year", 'date_histogram',
             {'interval': 'year'}),
        ),
        "integer": (
            ("extended_stats#{0}#int_stats", 'extended_stats', {}),
        ),
        "long": (
            ("extended_stats#{0}#long_stats", 'extended_stats', {}),
        ),
        "float": (
            ("extended_stats#{0}#float_stats", 'extended_stats', {}),
        ),
    }

    # "text" fields are handled separately: they are only aggregated when a
    # query body is given.
    _TEXT_AGGREGATIONS = (
        ("sigsterms#{0}#text_sigterms", 'significant_text',
         {'filter_duplicate_text': True}),
    )

    def __init__(self):
        self.field_counts = {}
        self.multi_search = MultiSearch()

    def query_conductor(self, indices, query_body, elasticsearch, es_url, excluded_fields):
        """Run the aggregations for every index and collect the responses.

        Args:
            indices: Comma separated index names.
            query_body: Optional query dict the searches are built from.
            elasticsearch: Elasticsearch connection used to execute.
            es_url: URL passed to ``DashboardEsHelper``.
            excluded_fields: Field paths that must not be aggregated.

        Returns:
            dict: index name -> list of response dicts for that index.
        """
        result = {}

        for index in indices.split(','):
            # Start every index with an empty batch. Otherwise the searches
            # of earlier indices (and earlier calls) are executed again and
            # their responses end up under this index's name.
            self.multi_search = MultiSearch()

            # Fetch all the fields and their types, then filter the ones we
            # don't want like _texta_id.
            normal_fields, nested_fields = DashboardEsHelper(
                es_url=es_url, indices=index).get_aggregation_field_data()
            normal_fields, nested_fields = self._filter_excluded_fields(
                excluded_fields, normal_fields, nested_fields)

            # Attach all the aggregations to Elasticsearch, depending on the
            # fields. Text, keywords get term aggs etc.
            self._normal_fields_handler(
                normal_fields, index=index, query_body=query_body,
                elasticsearch=elasticsearch)
            self._texta_facts_agg_handler(
                index=index, query_body=query_body,
                elasticsearch=elasticsearch)

            # Send the query towards Elasticsearch and then save it into the
            # result dict under its index's name.
            responses = self.multi_search.using(elasticsearch).execute()
            result[index] = [response.to_dict() for response in responses]

        return result

    def _normal_fields_handler(self, list_of_normal_fields, query_body, index, elasticsearch):
        """Count every field and queue the aggregation search for its type.

        Field types without an entry in the aggregation table only get
        their document count recorded.
        """
        for field_dict in list_of_normal_fields:
            field_type = field_dict['type']
            field_name = field_dict['full_path']
            clean_field_name = self._remove_dot_notation(field_name)

            search_gateway = elasticsearch_dsl.Search(index=index).using(elasticsearch)
            self.field_counts[field_name] = search_gateway.query(
                "exists", field=clean_field_name).count()

            if field_type == "text":
                if query_body is None:
                    continue
                aggregations = self._TEXT_AGGREGATIONS
            else:
                aggregations = self._FIELD_AGGREGATIONS.get(field_type)
                if aggregations is None:
                    continue

            search_dsl = self._create_search_object(
                query_body=query_body, index=index,
                elasticsearch=elasticsearch)
            for name_template, agg_type, extra in aggregations:
                search_dsl.aggs.bucket(
                    name_template.format(field_name), agg_type,
                    field=field_name, **extra)
            self.multi_search = self.multi_search.add(search_dsl)

    def _texta_facts_agg_handler(self, query_body, index, elasticsearch):
        """Queue the nested aggregation over the ``texta_facts`` field."""
        search_dsl = self._create_search_object(
            query_body=query_body, index=index, elasticsearch=elasticsearch)

        search_dsl.aggs.bucket("nested#texta_facts", 'nested', path='texta_facts') \
            .bucket('sterms#fact_category', 'terms', field='texta_facts.fact', collect_mode="breadth_first") \
            .bucket("sigsterms#significant_facts", 'significant_terms', field='texta_facts.str_val')

        self.multi_search = self.multi_search.add(search_dsl)

    def _filter_excluded_fields(self, excluded_fields, normal_fields, nested_fields):
        """Drop every field whose ``full_path`` is in ``excluded_fields``.

        Returns:
            tuple: ``(normal_fields, nested_fields)`` as new lists.
        """
        normal_fields = [
            field for field in normal_fields
            if field['full_path'] not in excluded_fields
        ]
        nested_fields = [
            field for field in nested_fields
            if field['full_path'] not in excluded_fields
        ]
        return normal_fields, nested_fields

    def _remove_dot_notation(self, field_name):
        """
        Keeps only the part of the field name before the first dot, to avoid
        potential conflicts in the front end.
        :param field_name: Name of a field inside Elasticsearch, ex article_lead.keyword
        :return: Name of the field up to the first dot, ex article_lead
        """
        return field_name.split('.')[0]

    def _create_search_object(self, query_body, index, elasticsearch):
        """Create a hit-less, source-less search on ``index``.

        The search is built from ``query_body`` when one is given and is an
        empty search otherwise; both variants are bound to ``elasticsearch``.
        """
        if query_body:
            search = elasticsearch_dsl.Search.from_dict(query_body)
        else:
            search = elasticsearch_dsl.Search()
        return search.index(index).using(elasticsearch).extra(size=0).source(False)
def __init__(self):
    """Start with an empty multi-search batch and no field counts."""
    self.multi_search = MultiSearch()
    self.field_counts = dict()