Example #1
0
    def _fetch_word_freqs_per_day(
        self,
        dataset_widget: DatasetWidget,
    ) -> Tuple[Mapping[str, Sequence[int]], Sequence[int], int]:
        """Fetch the top word frequencies for every day of the date range.

        One search per date yielded by ``date_range(self._min_date,
        self._max_date)`` is built from a shared template and all of them
        are sent in a single multi-search request.

        Returns:
            A ``(word_freqs, num_docs, took_msecs)`` tuple: a mapping from
            bucket key to a list with one document count per day (0 where
            the key has no bucket on that day), the total hit count per
            day, and the wall-clock duration of the request in
            milliseconds.
        """
        _LOGGER.debug("Fetching word frequencies per day.")

        helper = SearchHelper(dataset_widget.dataset.type)
        # size=0: only totals and aggregations are read from the responses.
        base_search = dataset_widget.set_search(
            Search().extra(size=0, track_total_hits=True))
        base_search = helper.add_agg_text_tokens_terms(
            base_search, size=self._top_n_words)

        one_day = timedelta(days=1)
        multi = MultiSearch()
        for day in date_range(self._min_date, self._max_date):
            day_filter = helper.query_date_range(gte=day, lt=day + one_day)
            multi = multi.add(base_search.filter(day_filter))

        started = time()
        responses = multi.execute()
        took_msecs = int((time() - started) * 1000)

        num_days = len(responses)
        word_freqs = defaultdict(lambda: [0] * num_days)
        num_docs = []
        for day_idx, response in enumerate(responses):
            num_docs.append(response.hits.total.value)
            for bucket in helper.read_agg_text_tokens_terms(response):
                word_freqs[bucket.key][day_idx] = bucket.doc_count

        return word_freqs, num_docs, took_msecs
Example #2
0
def run_searches(es, index, searches):
    """Run a list of Elasticsearch searches in a single MultiSearch request.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        index (str): Name of the index the queries should be run against.
        searches (list): Searches to run, each of type Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches. The originating Elasticsearch exception is attached
            as the cause.

    Returns:
        list: One entry per search; each entry is the list of hits
            (documents found) for that search, converted to dicts.

    """
    ms = MultiSearch(index=index, using=es)

    for search in searches:
        ms = ms.add(search)

    # Only the request itself is guarded, so unrelated errors raised while
    # post-processing the hits are not misreported as connection problems.
    try:
        responses = ms.execute(raise_on_error=True)
    except elasticsearch.ElasticsearchException as e:
        raise DataConnectionException() from e

    return [[hit.to_dict() for hit in response.hits]
            for response in responses]
Example #3
0
def _run_multisearch(es, searches):
    """Run a list of Elasticsearch searches through MultiSearch, in batches.

    The number of searches sent per request is configurable through
    constants.ES_MULTISEARCH_MAX_LEN.

    Args:
        es (Elasticsearch): Elasticsearch connection.
        searches (list): List of elasticsearch_dsl.Search.

    Raises:
        DataConnectionException: If an error occurred while running the
            searches.

    Returns:
        list: The response to each search, in the order the searches were
            given.

    """
    batch_len = constants.ES_MULTISEARCH_MAX_LEN
    all_responses = []

    # Split the searches into several batches when needed.
    for start in range(0, len(searches), batch_len):
        batch = MultiSearch(using=es)
        for search in searches[start:start + batch_len]:
            batch = batch.add(search)

        try:
            batch_responses = batch.execute(raise_on_error=True)
        except elasticsearch.ElasticsearchException as exc:
            raise DataConnectionException() from exc
        all_responses.extend(batch_responses)

    return all_responses
Example #4
0
    def mexist(self, src_lang, src_ids):
        """Check, for a batch of source ids, whether their searches have hits.

        For every id a search is built with ``self._create_search_mindexes``
        against the neighbours of ``src_lang`` in ``self.lang_graph``. The
        searches are executed in multi-search batches of
        ``MEXIST_BATCH_SIZE``.

        :param src_lang: source language used to look up the target languages
        :param src_ids: sequence of source ids; an empty one yields ``[]``
        :return: list with one entry per executed search: ``True``/``False``
                 depending on whether the response reports hits, or ``None``
                 when the response could not be interpreted
        """
        if not src_ids:
            return []
        tgt_langs = list(self.lang_graph.neighbors(src_lang))

        MEXIST_BATCH_SIZE = 10
        results = []
        for i in range(0, len(src_ids), MEXIST_BATCH_SIZE):
            msearch = MultiSearch(using=self.es)
            for source_id in src_ids[i:i + MEXIST_BATCH_SIZE]:
                search = self._create_search_mindexes(source_id, src_lang,
                                                      tgt_langs)
                # NOTE(review): ids without a search are skipped, so the
                # result list can be shorter than src_ids - confirm callers
                # do not rely on positional alignment.
                if search:
                    msearch = msearch.add(search)
            for res in msearch.execute():
                try:
                    results.append(bool('hits' in res and res.hits.total))
                except Exception:
                    # Raised if the Response is in some invalid state (no
                    # hits, hits are empty). Exception rather than a bare
                    # except so KeyboardInterrupt/SystemExit still propagate.
                    logging.warning("Invalid Response object: {}".format(
                        res.to_dict()))
                    results.append(None)
        return results
Example #5
0
    def mcount_buckets(self, buckets):
        """Count documents per index and per value for each given field.

        One search per field name is sent in a single multi-search over the
        indexes matching ``TMUtils.MAP_PREFIX*``. Each search aggregates by
        ``_index`` and, inside every index, by the field's values.

        :param buckets: iterable of field names to aggregate on
        :return: nested dict ``{index name without prefix: {field name:
                 {field value: document count}}}``
        """
        index_pattern = "{}*".format(TMUtils.MAP_PREFIX)
        multi = MultiSearch(using=self.es)
        for field_name in buckets:
            agg_search = Search(using=self.es, index=index_pattern)
            per_index = agg_search.aggs.bucket('indexes', 'terms',
                                               field='_index', size=999999)
            per_index.bucket('values', 'terms', field=field_name,
                             size=999999)
            multi = multi.add(agg_search)

        responses = multi.execute()

        prefix_pattern = "^{}".format(TMUtils.MAP_PREFIX)
        lang2buckets = dict()
        for field_name, response in zip(buckets, responses):
            has_indexes = (hasattr(response, "aggregations")
                           and 'indexes' in response.aggregations)
            if not has_indexes:
                continue
            for index_bucket in response.aggregations['indexes'].buckets:
                # Item access is used because the sub-aggregation is named
                # 'values'.
                for value_bucket in index_bucket['values'].buckets:
                    lang_pair = re.sub(prefix_pattern, "", index_bucket.key)
                    per_field = lang2buckets.setdefault(lang_pair, dict())
                    per_value = per_field.setdefault(field_name, dict())
                    per_value[value_bucket.key] = value_bucket.doc_count

        return lang2buckets
Example #6
0
    def mget(self, id_langs, return_multiple=False):
        """Fetch documents for several (source id, source lang, target lang) triples.

        A search is built for each triple with ``self._create_search`` and
        all of them are executed in one multi-search request.

        :param id_langs: iterable of ``(source_id, source_lang, target_lang)``
                         triples; an empty one yields ``[]``
        :param return_multiple: when false (default) only the first hit of
                                each response is returned; when true every
                                hit is appended to the flat result list
        :return: flat list of documents (swapped via ``self._swap`` when the
                 search requires it), with ``None`` for responses without
                 hits or responses that could not be interpreted
        """
        if not id_langs:
            return []
        msearch = MultiSearch(using=self.es)
        search_swap = []
        for source_id, source_lang, target_lang in id_langs:
            search, swap = self._create_search(source_id, source_lang,
                                               target_lang)
            # NOTE(review): triples without a search are skipped, so the
            # results are not positionally aligned with id_langs in that
            # case - confirm this is what callers expect.
            if search:
                # Sort by update date so in case of multiple segments having
                # the same source, the latest one will be returned
                search = search.sort('-update_date')
                msearch = msearch.add(search)
                search_swap.append(swap)

        responses = msearch.execute()
        results = []
        for res, swap in zip(responses, search_swap):
            try:
                if 'hits' not in res or not res.hits.total:
                    results.append(None)
                    continue
                for ret_doc in res.hits:
                    # Exchange source and target (if needed)
                    if swap:
                        ret_doc = self._swap(ret_doc)
                    results.append(ret_doc)
                    if not return_multiple:
                        break
            except Exception:
                # Raised if the Response is in some invalid state (no hits,
                # hits are empty). Exception rather than a bare except so
                # KeyboardInterrupt/SystemExit still propagate.
                logging.warning("Invalid Response object: {}".format(
                    res.to_dict()))
                results.append(None)
        return results
Example #7
0
def simple_search_public_data(query_text):
    """Search experiments, datasets and datafiles restricted to public data.

    Three searches are sent in one multi-search request: experiments are
    matched on ``title``, datasets on ``description`` and datafiles on
    ``filename``. Each is combined with a term query requiring
    ``public_access`` to be 100 on the (parent) experiment.

    Returns a dict with the keys "experiments", "datasets" and "datafiles",
    each holding the matching raw hits converted to dicts.
    """
    index_list = ['experiments', 'dataset', 'datafile']
    ms = MultiSearch(index=index_list)

    def bounded_search(index_name):
        # Every sub-search shares the same result-size and score cut-off.
        return Search(index=index_name).extra(size=MAX_SEARCH_RESULTS,
                                              min_score=MIN_CUTOFF_SCORE)

    exp_query = Q("match", title=query_text) & Q("term", public_access=100)
    ms = ms.add(bounded_search('experiments').query(exp_query))

    dataset_acl = Q("term", **{'experiments.public_access': 100})
    dataset_search = bounded_search('dataset').query(
        Q("match", description=query_text))
    ms = ms.add(dataset_search.query('nested', path='experiments',
                                     query=dataset_acl))

    datafile_query = (Q("match", filename=query_text)
                      & Q("term", experiments__public_access=100))
    ms = ms.add(bounded_search('datafile').query(datafile_query))

    # Maps the index a hit came from to its key in the result dict; hits
    # from any other index are ignored.
    key_for_index = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    for response in ms.execute():
        for hit in response.hits.hits:
            key = key_for_index.get(hit["_index"])
            if key is not None:
                result_dict[key].append(hit.to_dict())
    return result_dict
Example #8
0
    def get_object_list(self, request):
        """Run the search for the 'query' GET parameter, honouring access rights.

        Unauthenticated users are served by ``simple_search_public_data``.
        For authenticated users three searches (experiments on ``title``,
        datasets on ``description``, datafiles on ``filename``) are sent in
        one multi-search; each is restricted to objects whose ACL entity id
        equals the user id or one of the user's group ids, or whose
        ``public_access`` is 100.

        Returns a one-element list holding a ``SearchObject`` whose ``hits``
        is a dict with the keys "experiments", "datasets" and "datafiles".
        """
        user = request.user
        query_text = request.GET.get('query', None)
        if not user.is_authenticated:
            public_hits = simple_search_public_data(query_text)
            return [SearchObject(id=1, hits=public_hits)]

        groups = user.groups.all()
        index_list = ['experiments', 'dataset', 'datafile']
        ms = MultiSearch(index=index_list)

        def bounded_search(index_name):
            # Every sub-search shares the same result-size and score cut-off.
            return Search(index=index_name).extra(
                size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)

        # Experiments: the ACL lives on the document itself.
        exp_acl = (Q("term", objectacls__entityId=user.id)
                   | Q("term", public_access=100))
        for group in groups:
            exp_acl = exp_acl | Q("term", objectacls__entityId=group.id)
        exp_query = Q("match", title=query_text) & exp_acl
        ms = ms.add(bounded_search('experiments').query(exp_query))

        # Datasets: the ACL is checked through a nested query on the
        # 'experiments' path.
        dataset_acl = (
            Q("term", **{'experiments.objectacls.entityId': user.id})
            | Q("term", **{'experiments.public_access': 100}))
        for group in groups:
            dataset_acl = dataset_acl | Q(
                "term", **{'experiments.objectacls.entityId': group.id})
        dataset_search = bounded_search('dataset').query(
            Q("match", description=query_text))
        ms = ms.add(dataset_search.query('nested', path='experiments',
                                         query=dataset_acl))

        # Datafiles: the ACL fields are addressed as plain dotted fields.
        datafile_acl = (Q("term", experiments__objectacls__entityId=user.id)
                        | Q("term", experiments__public_access=100))
        for group in groups:
            datafile_acl = datafile_acl | Q(
                "term", experiments__objectacls__entityId=group.id)
        datafile_query = Q("match", filename=query_text) & datafile_acl
        ms = ms.add(bounded_search('datafile').query(datafile_query))

        results = ms.execute()

        # Maps the index a hit came from to its key in the result dict; hits
        # from any other index are ignored.
        key_for_index = {
            "dataset": "datasets",
            "experiments": "experiments",
            "datafile": "datafiles",
        }
        result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
        for response in results:
            for hit in response.hits.hits:
                key = key_for_index.get(hit["_index"])
                if key is not None:
                    result_dict[key].append(hit.to_dict())

        return [SearchObject(id=1, hits=result_dict)]
Example #9
0
class Getdata():
    # Holds an Elasticsearch client plus a Search and a MultiSearch bound to
    # the 'test-2018.09.11' index, all as class attributes.
    #
    # NOTE: these statements form the class body, so they run once when the
    # class is defined; that includes the execute() and count() calls below.
    # NOTE(review): host and index name are hard-coded - confirm this is
    # intended outside of ad-hoc experiments.
    es = Elasticsearch(hosts='http://124.49.54.38:19200/')
    # Both request objects are created with a request_timeout of 30.
    search = Search(using=es,
                    index='test-2018.09.11').params(request_timeout=30)
    multi = MultiSearch(using=es,
                        index='test-2018.09.11').params(request_timeout=30)
    # Response of the unfiltered search executed above.
    res = search.execute()

    # Result of search.count() for the same unfiltered search.
    total = search.count()
Example #10
0
    def get_multi_search(self):
        """Build the MultiSearch for this query.

        The multi-search always contains the search returned by
        ``self.get_search()``. When the aggregations parameter is present in
        ``self.args`` (i.e. not None), it is additionally passed through
        ``self.add_terms_aggregations``.
        """
        multi_search = MultiSearch().add(self.get_search())

        if self.args.get(constants.PARAM_AGGREGATIONS) is None:
            return multi_search
        return self.add_terms_aggregations(multi_search)
Example #11
0
def multi_search(searchs):
    """Execute the given searches against "log-index" in one multi-search.

    Uses the module-level ``conn`` connection and returns the result of
    ``MultiSearch.execute()`` unchanged.
    """
    batch = MultiSearch(using=conn, index="log-index")
    for single_search in searchs:
        batch = batch.add(single_search)
    return batch.execute()
def select_fields(all_fields, search, number_of_groups):
    '''
    Selects the fields from the given Fields object which are most common across the given
    resource ids. The search parameter is used to limit the records that contribute fields to the
    returned selection. The fields returned must appear in the search in at least one resource with
    at least one value present, unless their group is forced.

    :param all_fields: a Fields object
    :param search: an elasticsearch-dsl search object
    :param number_of_groups: the number of groups to select from the Fields object and return;
                             also used as the size of the chunks the group searches are run in
    :return: a list of groups, each group is a dict containing:
                - "group" - the group name
                - "count" - the number of resources its fields appear in
                - "records" - the number of records the group's fields appear in
                - "fields" - the fields that make up the group along with the resource ids they come
                             from
                - "forced" - whether the group was forced into being included, or whether it was
                             included organically
    '''
    selected_fields = []
    # make sure we don't get any hits back, we're only interested in the counts
    search = search.extra(size=0)

    # iterate over the groups and searches in chunks
    for chunk in chunk_iterator(all_fields.get_searches(search),
                                chunk_size=number_of_groups):
        groups, group_searches = zip(*chunk)
        # create a multisearch for all the searches in the chunk; the loop variable is
        # deliberately not called "search" so that it doesn't shadow the parameter
        multisearch = MultiSearch(using=common.ES_CLIENT)
        for group_search in group_searches:
            multisearch = multisearch.add(group_search)

        for (group, count, fields), response in zip(groups,
                                                    multisearch.execute()):
            forced = all_fields.is_forced(group)
            if forced or response.hits.total > 0:
                # the group is forced or a field from it has values in the search result, add it
                # to the selection
                selected_fields.append(
                    dict(group=group,
                         count=count,
                         records=response.hits.total,
                         fields=fields,
                         forced=forced))

        # chunks are processed whole, so stop as soon as enough groups have been collected
        if len(selected_fields) >= number_of_groups:
            break

    def group_sorter(the_group):
        # this sorts the groups ensuring forced groups are first, in the order they were forced,
        # then the groups with highest count and then the ones with the highest number of records
        if the_group[u'forced']:
            # use 0 0 to ensure that the base order of the groups is maintained for forced groups
            return True, 0, 0
        else:
            return False, the_group[u'count'], the_group[u'records']

    # sort the returned selected list by count and secondly records
    return sorted(selected_fields, key=group_sorter, reverse=True)
Example #13
0
def es_create_result_csv_bulk(name, index, result_size=200, batch_size=1000):
    """Run one more-like-this search per document and dump the rankings to CSV.

    For every position ``0 .. count - 1`` (``count`` being the number of
    documents in ``index``) a search is built with ``create_mlt_with_id``.
    The searches are executed in multi-search batches of ``batch_size``,
    including a final, shorter batch when ``count`` is not a multiple of
    ``batch_size``.

    The rankings are written to ``{faiss_path}/{name}/search_rankings.csv``
    (one row per position: the position followed by ``"<id> (<score>)"`` for
    each hit); an existing ``{faiss_path}/{name}/`` directory is deleted
    first. The elapsed time is appended to
    ``./datasets/elasticsearch_{name}_timing``.

    NOTE(review): presumably document ids are the consecutive integers
    ``0 .. count - 1`` - confirm against ``create_mlt_with_id``.

    Args:
        name: Name of the result set; used in the output paths.
        index: Elasticsearch index to query.
        result_size: Passed through to ``create_mlt_with_id``.
        batch_size: Maximum number of searches per multi-search request.

    Returns:
        The elapsed time in seconds, measured just before the timing file is
        written.
    """
    start_time = time.time()
    index_size = Search(index=index).count()
    results = []
    # A single loop covers the full batches and the trailing partial one, so
    # every batch uses its own positions for both the searches and the row
    # labels.
    for batch_start in range(0, index_size, batch_size):
        batch_end = min(batch_start + batch_size, index_size)
        print(f'generating results number {batch_start} to {batch_end}')
        multisearch = MultiSearch(index=index)
        for item in range(batch_start, batch_end):
            multisearch = multisearch.add(
                create_mlt_with_id(item, index, result_size))
        responses = multisearch.execute()
        for index_id, response in enumerate(responses, start=batch_start):
            results.append(
                [str(index_id)] +
                [f'{hit.meta.id} ({hit.meta.score})' for hit in response])

    out_dir = f'{faiss_path}/{name}/'
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        print('directory already exists and I am just deleting it.')
        shutil.rmtree(out_dir)
        os.mkdir(out_dir)
    with open(f'{faiss_path}/{name}/search_rankings.csv', 'w',
              newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(results)
    stop_time = time.time() - start_time
    with open(f'./datasets/elasticsearch_{name}_timing', 'a') as f:
        f.write(f'time for generating es results for {name}: {stop_time}\n')
    return stop_time
Example #14
0
 def __init__(self, es, index, limit=10, q=None, filter=None):
   """Set up the multi-search state, then build queries and filters.

   A single query string is wrapped in a list before being handed to
   ``self._build``; a single filter dict is wrapped in a list before being
   handed to ``self._filter``. Any other value is passed through as given.
   """
   self.search = []
   self.msearch = MultiSearch(using=es, index=index)
   self.queries = []
   self.num_segs = 0
   self.limit = limit

   # Build queries
   query_list = [q] if isinstance(q, str) else q
   self._build(query_list)

   # Build filter(s)
   filter_list = [filter] if isinstance(filter, dict) else filter
   self._filter(filter_list, es, index)
Example #15
0
    def build(self, q=None, **options):
        """
        Build a query according to q and options.
        This is the public method called by API handlers.

        Regarding scopes:
            scopes: [str] nonempty, match query.
            scopes: NoneType, or [], no scope, so query string query.

        Additionally support these options:
            explain: include es scoring information
            userquery: customized function to interpret q

        * additional keywords are passed through as es keywords
            for example: 'explain', 'version' ...

        * multi-search is supported when q is a list. all queries
            are built individually and then sent in one request.

        Returns an ESScrollID when the scroll_id option is set, a
        MultiSearch when q is a list, and otherwise whatever
        self._build_one returns for q.

        Raises ValueError when query building hits an IllegalOperation (the
        original exception is attached as the cause) and RawQueryInterrupt,
        carrying the query as a dict, when the rawquery option is set.
        """
        options = dotdict(options)

        if options.scroll_id:
            # bypass all query building stages
            return ESScrollID(options.scroll_id)

        if options.fetch_all:
            # clean up conflicting parameters
            options.pop('sort', None)
            options.pop('size', None)

        try:
            # process single q vs list of q(s).
            # dispatch 'val' vs 'key:val' to corresponding functions.

            if isinstance(q, list):
                search = MultiSearch()
                for _q in q:
                    _search = self._build_one(_q, options)
                    search = search.add(_search)
            else:  # str, int ...
                search = self._build_one(q, options)

        except IllegalOperation as exc:
            # ex. sorting by -_score; chained so the traceback keeps the
            # original elasticsearch-dsl error
            raise ValueError(str(exc)) from exc

        if options.get('rawquery'):
            raise RawQueryInterrupt(search.to_dict())

        return search
Example #16
0
def get_usernames_for_crawl():
    """Yield the usernames in the 'populars' index that are due for a crawl.

    Two searches are sent in one multi-search: documents that have no
    ``last_update`` field, and documents whose ``last_update`` is at most
    ``now-2d``. Each search is sized to its own ``count()`` so that all
    matches come back in a single page.

    Both searches are bound to the 'populars' index so that ``count()`` is
    computed over the same index the multi-search queries, rather than over
    every index reachable through the default connection.

    NOTE(review): a single page this large may hit Elasticsearch's result
    window limit on big indexes - confirm the expected index size.
    """
    index = 'populars'
    ms = MultiSearch(index=index)

    q = Q({"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search(index=index).query(q)
    never_updated = never_updated[0:never_updated.count()]

    old_updated = Search(index=index).query(
        'range', last_update={"lte": "now-2d"})
    old_updated = old_updated[0:old_updated.count()]

    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    for res in ms.execute():
        for hit in res:
            yield hit.username
Example #17
0
def multisearch(*models, **params):
    """Search several models in one multi-search request.

    A search is created for each model with ``search_for(model, **params)``
    and its underlying ``_s`` search is added to the multi-search. Returns
    one ``SearchResult`` per model, in the order the models were given.
    """
    batch = MultiSearch(using=es.client, index=es.index_name)
    wrapped_searches = []
    for model in models:
        wrapped = search_for(model, **params)
        batch = batch.add(wrapped._s)
        wrapped_searches.append(wrapped)

    raw_responses = batch.execute()

    results = []
    for wrapped, raw in zip(wrapped_searches, raw_responses):
        # _d_ is the only way to access the raw data, which allows
        # re-wrapping the response in a FacetedSearch because the default
        # multisearch loses the facets
        results.append(SearchResult(wrapped, raw._d_))
    return results
Example #18
0
def test_multi_search(data_client):
    """MultiSearch over the 'git' index returns one response per added search.

    Each response must expose the search it answers through ``.search``.
    """
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')

    multi = MultiSearch(index='git').add(repo_search).add(commit_search)
    repo_response, commit_response = multi.execute()

    # The repository search yields a single, typed document.
    assert 1 == len(repo_response)
    assert isinstance(repo_response[0], Repository)
    assert repo_response.search is repo_search

    # The commits search reports its total through hits.total.
    assert 52 == commit_response.hits.total
    assert commit_response.search is commit_search
def test_multi_search(data_client):
    """MultiSearch without a default index returns one response per search.

    Each response must expose the search it answers through ``._search``.
    """
    repo_search = Repository.search()
    flat_search = Search(index="flat-git")

    multi = MultiSearch().add(repo_search).add(flat_search)
    repo_response, flat_response = multi.execute()

    # The repository search yields a single, typed document.
    assert 1 == len(repo_response)
    assert isinstance(repo_response[0], Repository)
    assert repo_response._search is repo_search

    # The flat-git search reports its total through hits.total.value.
    assert 52 == flat_response.hits.total.value
    assert flat_response._search is flat_search
Example #20
0
    def get(self, request, *args, **kwargs):
        """Search categories and nearby restaurants for the 'q' GET parameter.

        With a query, two searches are sent in one multi-search: categories
        whose ``label`` matches, and restaurants within ``radius`` miles of
        the resolved coordinates whose ``name`` or nested category label
        matches. Without a query, the first ten categories are returned.

        The response body is the list of raw hits converted to dicts.
        """
        params = request.GET
        query = params.get('q')
        coords = [
            params.get('latitude'),
            params.get('longitude'),
            params.get('radius'),
        ]
        latitude, longitude, radius = get_user_coordinates(coords, request)
        ms = MultiSearch(index=['restaurants', 'categories'])

        if not query:
            # No search text: fall back to a plain category listing.
            category_search = CategoryDocument.search().source([])[0:10]
            hits = category_search.execute()['hits']['hits']
            return Response([hit.to_dict() for hit in hits])

        category_search = CategoryDocument.search().query(
            "query_string", query=query, default_field="label")

        name_match = Q('query_string', query=query, default_field='name')
        category_match = Q('nested',
                           path='categories',
                           query=Q('query_string',
                                   query=query,
                                   default_field='categories.label'))
        nearby = RestaurantDocument.search().filter(
            'geo_distance',
            distance='%smi' % radius,
            location={"lat": latitude, "lon": longitude})
        restaurant_search = nearby.query(name_match | category_match)

        ms = ms.add(category_search)
        ms = ms.add(restaurant_search)

        aggregate = []
        for response in ms.execute():
            aggregate.extend(
                hit.to_dict() for hit in response['hits']['hits'])

        return Response(aggregate)
Example #21
0
    def query(self, queries, size, record_fnum):
        """Look up records for a batch of (userid, record) queries.

        For each query, documents matching both ``userid`` (``q[0]``) and
        ``record`` (``q[1]``) are searched, limited to ``size`` hits; all
        searches are sent in one multi-search request.

        :param queries: iterable of indexable items; element 0 is matched
                        against ``userid`` and element 1 against ``record``
        :param size: number of rows returned per query
        :param record_fnum: length of the all-zero rows used as padding
        :return: one list per query, each holding ``size`` rows; a row built
                 from a hit is the hit's userid followed by its
                 comma-separated record values, all as ints; missing rows
                 are padded with zeros
        """
        ms = MultiSearch(using=self.es, index=self.index_name)
        for q in queries:
            s = Search().query("match", userid=q[0]).query("match", record=q[1])[:size]
            ms = ms.add(s)
        responses = ms.execute()

        res_batch = []
        for response in responses:
            res = [
                [int(hit.userid)] + [int(v) for v in hit.record.split(',')]
                for hit in response
            ]
            missing = size - len(res)
            if missing > 0:
                # Build a fresh zero row per padded slot: multiplying a
                # one-element list would make every padded row the same
                # list object, so mutating one would change them all.
                res.extend(
                    np.zeros([record_fnum,]).astype(np.int32).tolist()
                    for _ in range(missing))
            res_batch.append(res)
        return res_batch
        
Example #22
0
    def get_queryset(self, queryset, data):
        """Build a MultiSearch with one search per requested model.

        The models come from the comma-separated ``data['models']`` or, when
        that key is absent, from ``self._supported_models``. For models in
        ``self._completion_models`` (while the 'S39_filter_by_geodata.be'
        flag is enabled) a completion suggester is executed first and the
        model's search is restricted to the suggested ids. Every other model
        gets a bool/should query over the 'title' and 'notes' fields, limited
        to ``per_model`` results.

        The ``queryset`` argument is not used.
        """
        phrase = data.get('q')
        models = (data['models'].split(',')
                  if 'models' in data else self._supported_models)

        op, suffix = get_advanced_options(data.get('advanced'))
        lang = get_language()
        per_model = data.get('per_model', 1)

        common_index = settings.ELASTICSEARCH_COMMON_ALIAS_NAME
        ms = MultiSearch(index=common_index)

        for model in models:
            use_completion = (is_enabled('S39_filter_by_geodata.be')
                              and model in self._completion_models)
            if use_completion:
                # The suggester runs immediately, against the model's own
                # index; only the follow-up ids query joins the multi-search.
                completion = {
                    'field': f'title.{lang}.suggest',
                    'size': per_model,
                }
                suggester = Search(index=f'{model}s').suggest(
                    'title', phrase, completion=completion)
                suggestions = suggester.execute().suggest['title'][0]
                ids = [sug['_id'] for sug in suggestions['options']]
                query = Search(index=common_index).filter(
                    'term', model=model).query('ids', values=ids)
            else:
                base = Search(index=common_index).filter('term', model=model)
                should = [
                    nested_query_with_advanced_opts(phrase, field, lang, op,
                                                    suffix)
                    for field in ('title', 'notes')
                ]
                query = base.query('bool', should=should).extra(
                    size=per_model)
            ms = ms.add(query)

        return ms
Example #23
0
    def build(self, q, **options):
        """Build a query, with special handling for graph queries.

        A ``GraphQuery`` is built with ``self.build_graph_query``; a
        ``GraphQueries`` collection becomes a ``MultiSearch`` holding one
        graph query per element. Anything else is delegated to the parent
        class.
        """
        # NOTE
        # GRAPH QUERY CUSTOMIZATION

        # ONE
        if isinstance(q, GraphQuery):
            return self.build_graph_query(q, **options)

        # NOT GRAPH
        if not isinstance(q, GraphQueries):
            return super().build(q, **options)

        # MULTI
        multi = MultiSearch()
        for graph_query in q:
            multi = multi.add(self.build_graph_query(graph_query, **options))
        return multi
Example #24
0
def multi_search(request):
	"""Search author, title and json_object for the 'q' GET parameter.

	With a query, three match searches are sent in one multi-search against
	"esdocument-index" and the titles of all hits are collected. Without a
	query, ``responses`` is the string 'empty' and ``hits`` is an empty
	list.

	Renders 'elasticsearchapp/search.html' with ``responses`` and ``hits``
	in the context.
	"""
	client = Elasticsearch()
	q = request.GET.get('q')
	# Initialised up front so the template context is complete even when no
	# query was given.
	hits = []
	if q:
		ms = MultiSearch(using=client, index="esdocument-index")
		ms = ms.add(Search().query("match", author=q))
		ms = ms.add(Search().query("match", title=q))
		ms = ms.add(Search().query("match", json_object=q))
		responses = ms.execute()
		for response in responses:
			for hit in response:
				hits.append(hit.title)
	else:
		responses = 'empty'

	return render(request, 'elasticsearchapp/search.html',
		{'responses': responses, 'hits': hits})
Example #25
0
    def _get_parsed_data(self):
        """Run the stored queries against the source and parse the results.

        ``self.queries`` is a JSON list of query dicts; each becomes a search
        in one multi-search executed with the source's client and index.

        Returns a dict with:
            - 'raw': the raw response dicts, one per response processed
            - 'data': the parsed aggregation data of all responses with hits
            - 'error': True if an exception occurred while executing or
              parsing, in which case 'error_code' (exception class name) and
              'error_message' are set as well
        """
        # Error will be set to true if we encounter an error
        parsed_data = dict(raw=[], error=False, data=[])
        source = ElasticsearchSource.objects.get(name=self.source.name)
        multisearch = MultiSearch()

        if source.max_concurrent_searches is not None:
            # params() returns a modified copy instead of changing the
            # object in place, so the result has to be rebound for the
            # setting to take effect.
            multisearch = multisearch.params(
                max_concurrent_searches=source.max_concurrent_searches)

        for query in json.loads(self.queries):
            multisearch = multisearch.add(
                Search.from_dict(query).params(ignore_unavailable=True,
                                               allow_no_indices=True))

        try:
            responses = multisearch.using(source.client).index(
                source.index).execute()

            for response in responses:
                raw_data = response.to_dict()
                parsed_data['raw'].append(raw_data)

                # Responses without hits contribute raw data only.
                if raw_data['hits']['hits'] == []:
                    continue

                self._check_response_size(raw_data)

                data = self._parse_es_response([raw_data['aggregations']])
                if data == []:
                    continue

                parsed_data['data'].extend(data)

        except Exception as e:
            # Deliberately broad: any failure is logged and reported through
            # the returned dict instead of being raised to the caller.
            logger.exception(
                'Error executing Elasticsearch queries: {}'.format(
                    self.queries))
            parsed_data['error_code'] = type(e).__name__
            parsed_data['error_message'] = six.text_type(e)
            parsed_data['error'] = True

        return parsed_data
Example #26
0
    def execute_searches(self):
        """Run the queries of all added series in one multi-search.

        Returns a dict with the formatted responses under 'data' and the
        largest ``hits.total`` among the responses under 'count'.
        """
        batch = MultiSearch(index=self.index, doc_type=settings.TS_DOC_TYPE)
        for serie in self.series:
            batch = batch.add(serie.search)
        responses = batch.execute()

        formatter = ResponseFormatter(
            self.series,
            responses,
            self.args[constants.PARAM_SORT],
            self.args[constants.PARAM_PERIODICITY],
        )
        data = formatter.format_response()
        count = max(response.hits.total for response in responses)

        return {'data': data, 'count': count}
Example #27
0
    def execute_searches(self):
        """Run the queries of all added series and store the results.

        Sets ``self.data`` to the formatted responses and ``self.count`` to
        the largest ``hits.total`` among the responses. Raises QueryError
        when no series has been added.
        """
        if not self.series:
            raise QueryError(strings.EMPTY_QUERY_ERROR)

        batch = MultiSearch(index=self.index,
                            doc_type=settings.TS_DOC_TYPE,
                            using=self.elastic)

        for serie in self.series:
            # Each series is collapsed to the requested periodicity before
            # its search is added.
            serie.add_collapse(self.args[constants.PARAM_PERIODICITY])
            batch = batch.add(serie.search)

        responses = batch.execute()
        self.data = ResponseFormatter(self.series, responses,
                                      self.args).format_response()

        hit_totals = [response.hits.total for response in responses]
        self.count = max(hit_totals)
def calculate_field_counts(request, es_client):
    '''
    Works out, for the search of a download request, how many records have a value for each
    field of each resource.

    :param request: the DownloadRequest object
    :param es_client: the elasticsearch client to use
    :return: a dict of resource ids -> fields -> counts
    '''
    field_counts = defaultdict(dict)
    for resource_id, version in request.resource_ids_and_versions.items():
        index_name = prefix_resource(resource_id)
        # the mapping lists the fields of all versions of the index, so it only tells us which
        # fields to look up; whether a field is present in the search at this version is
        # determined by the exists searches below
        mapping = es_client.indices.get_mapping(index_name)[index_name]

        # the search every per-field search is derived from: no hits returned, restricted to
        # the requested version
        versioned_search = Search.from_dict(request.search) \
            .index(index_name) \
            .using(es_client) \
            .extra(size=0) \
            .filter(create_version_query(version))

        # all the field names, using dot notation for nested fields
        field_names = [
            u'.'.join(parts) for parts, _config in iter_data_fields(mapping)
        ]

        # one multisearch per resource, holding one exists search per field
        multisearch = MultiSearch(using=es_client, index=index_name)
        for field_name in field_names:
            has_value = versioned_search.filter(u'exists',
                                                field=prefix_field(field_name))
            multisearch = multisearch.add(has_value)

        for field_name, response in zip(field_names, multisearch.execute()):
            field_counts[resource_id][field_name] = response.hits.total

    return field_counts
def find_searched_resources(search, resource_ids):
    '''
    Works out which of the given resources actually have records in the results of the given
    search.

    :param search: an elasticsearch-dsl object
    :param resource_ids: a list of resource ids
    :return: a list of resource ids
    '''
    # aggs modifies the search in place rather than returning a clone, so work on a copy
    indexes = [prefix_resource(resource_id) for resource_id in resource_ids]
    search_copy = copy(search).index(indexes)
    search_copy.aggs.bucket(u'indexes', u'terms', field=u'_index')

    responses = MultiSearch(using=common.ES_CLIENT).add(search_copy).execute()
    # only one search was added, so only the first response is of interest
    result = next(iter(responses))

    found = []
    for bucket in result.aggs.to_dict()[u'indexes'][u'buckets']:
        if bucket[u'doc_count'] > 0:
            found.append(trim_index_name(bucket[u'key']))
    return found
Example #30
0
def test_multi_missing(data_client):
    """A search on a missing index fails the multi-search unless errors are tolerated.

    By default ``execute()`` raises TransportError; with
    ``raise_on_error=False`` the failing search's response is ``None`` and
    the other responses are returned as usual.
    """
    repo_search = Repository.search()
    commit_search = Search(doc_type='commits')
    missing_search = Search(index='does_not_exist')

    multi = MultiSearch().add(repo_search).add(commit_search).add(
        missing_search)

    with raises(TransportError):
        multi.execute()

    repo_response, commit_response, missing_response = multi.execute(
        raise_on_error=False)

    assert 1 == len(repo_response)
    assert isinstance(repo_response[0], Repository)
    assert repo_response.search is repo_search

    assert 52 == commit_response.hits.total
    assert commit_response.search is commit_search

    assert missing_response is None