Example #1
    def mcount_buckets(self, buckets):
        ms = MultiSearch(using=self.es)
        for bucket_name in buckets:
            search = Search(using=self.es,
                            index="{}*".format(TMUtils.MAP_PREFIX))
            search.aggs.bucket('indexes', 'terms', field='_index',
                               size=999999).bucket('values',
                                                   'terms',
                                                   field=bucket_name,
                                                   size=999999)
            ms = ms.add(search)

        mres = ms.execute()

        lang2buckets = dict()
        for bucket_name, res in zip(buckets, mres):
            if hasattr(res, "aggregations") and 'indexes' in res.aggregations:
                triple_list = [(re.sub("^{}".format(TMUtils.MAP_PREFIX), "",
                                       x.key), y.key, y.doc_count)
                               for x in res.aggregations['indexes'].buckets
                               for y in x['values'].buckets]
                for lang_pair, bucket_value, count in triple_list:
                    lang2buckets.setdefault(lang_pair, dict()).setdefault(
                        bucket_name, dict())[bucket_value] = count

        return lang2buckets
Example #2
def _run_multisearch(es, searches):
    """Ejecuta una lista de búsquedas Elasticsearch utilizando la función
    MultiSearch. La cantidad de búsquedas que se envían a la vez es
    configurable vía la variable ES_MULTISEARCH_MAX_LEN.

    Args:
        es (Elasticsearch): Conexión a Elasticsearch.
        searches (list): Lista de elasticsearch_dsl.Search.

    Raises:
        DataConnectionException: Si ocurrió un error al ejecutar las búsquedas.

    Returns:
        list: Lista de respuestas a cada búsqueda.

    """
    step_size = constants.ES_MULTISEARCH_MAX_LEN
    responses = []

    # Split the searches into several batches if necessary.
    for i in range(0, len(searches), step_size):
        end = min(i + step_size, len(searches))
        ms = MultiSearch(using=es)

        for j in range(i, end):
            ms = ms.add(searches[j])

        try:
            responses.extend(ms.execute(raise_on_error=True))
        except elasticsearch.ElasticsearchException as e:
            raise DataConnectionException() from e

    return responses
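
A minimal usage sketch for _run_multisearch above. The connection, index name
and query field are illustrative assumptions, and ES_MULTISEARCH_MAX_LEN only
controls how many searches go into each msearch request behind the scenes.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Hypothetical usage; index and field names are assumptions for illustration.
es = Elasticsearch()  # assumes a reachable cluster on the default host/port
searches = [
    Search(index="localities").query("match", name="San Martin"),
    Search(index="localities").query("match", name="Belgrano"),
]
responses = _run_multisearch(es, searches)
for search, response in zip(searches, responses):
    print(search.to_dict(), "->", response.hits.total)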
Example #3
class MultiSearch(object):
    def __init__(self, index=None, queries=None):
        self.index = index
        self._queries = BaseMultiSearch(
            index=self.index._meta.index if index else None)

        for query in queries or []:
            self.add(query)

    def raw(self, raw_dict):
        return Search().raw(raw_dict)

    def filter(self, *args, **kw):
        return Search().filter(*args, **kw)

    def query(self, *args, **kw):
        return Search().query(*args, **kw)

    def add(self, *queries):
        for query in queries:
            self._queries = self._queries.add(query)

    def execute(self):
        return self._queries.execute()

    def __iter__(self):
        return iter(self.execute())

    def __len__(self):
        return len(self._queries)
Example #4
    def mget(self, id_langs, return_multiple=False):
        if not id_langs: return []
        msearch = MultiSearch(using=self.es)
        search_swap = []
        for source_id, source_lang, target_lang in id_langs:
            search, swap = self._create_search(source_id, source_lang,
                                               target_lang)
            if search:
                # Sort by update date so in case of multiple segments having the same source, the latest one will be returned
                search = search.sort('-update_date')
                msearch = msearch.add(search)
                search_swap.append(swap)

        responses = msearch.execute()
        results = []
        for res, swap in zip(responses, search_swap):
            try:
                if 'hits' not in res or not res.hits.total:
                    results.append(None)
                    continue
                for ret_doc in res.hits:
                    # Exchange source and target (if needed)
                    if swap: ret_doc = self._swap(ret_doc)
                    results.append(ret_doc)
                    if not return_multiple: break
            except Exception:
                # Exception is thrown if Response is in some invalid state (no hits, hits are empty)
                logging.warning("Invalid Response object: {}".format(
                    res.to_dict()))
                results.append(None)
                continue
        return results
Example #5
    def mexist(self, src_lang, src_ids):
        if not src_ids: return []
        tgt_langs = [
            target_lang for target_lang in self.lang_graph.neighbors(src_lang)
        ]

        MEXIST_BATCH_SIZE = 10
        results = []
        for i in range(0, len(src_ids), MEXIST_BATCH_SIZE):
            msearch = MultiSearch(using=self.es)
            for source_id in src_ids[i:i + MEXIST_BATCH_SIZE]:
                search = self._create_search_mindexes(source_id, src_lang,
                                                      tgt_langs)
                if search:
                    msearch = msearch.add(search)
            responses = msearch.execute()
            for res in responses:
                try:
                    results.append(bool('hits' in res and res.hits.total))
                except Exception:
                    # Exception is thrown if Response is in some invalid state (no hits, hits are empty)
                    logging.warning("Invalid Response object: {}".format(
                        res.to_dict()))
                    results.append(None)
        return results
Example #6
def simple_search_public_data(query_text):
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    index_list = ['experiments', 'dataset', 'datafile']
    ms = MultiSearch(index=index_list)
    query_exp = Q("match", title=query_text)
    query_exp_oacl = Q("term", public_access=100)
    query_exp = query_exp & query_exp_oacl
    ms = ms.add(Search(index='experiments')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                .query(query_exp))
    query_dataset = Q("match", description=query_text)
    query_dataset_oacl = Q("term", **{'experiments.public_access': 100})
    ms = ms.add(Search(index='dataset')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE).query(query_dataset)
                .query('nested', path='experiments', query=query_dataset_oacl))
    query_datafile = Q("match", filename=query_text)
    query_datafile_oacl = Q("term", experiments__public_access=100)
    query_datafile = query_datafile & query_datafile_oacl
    ms = ms.add(Search(index='datafile')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                .query(query_datafile))
    results = ms.execute()
    for item in results:
        for hit in item.hits.hits:
            if hit["_index"] == "dataset":
                result_dict["datasets"].append(hit.to_dict())

            elif hit["_index"] == "experiments":
                result_dict["experiments"].append(hit.to_dict())

            elif hit["_index"] == "datafile":
                result_dict["datafiles"].append(hit.to_dict())
    return result_dict
Example #7
    def _fetch_word_freqs_per_day(
        self,
        dataset_widget: DatasetWidget,
    ) -> Tuple[Mapping[str, Sequence[int]], Sequence[int], int]:
        _LOGGER.debug("Fetching word frequencies per day.")

        search_helper = SearchHelper(dataset_widget.dataset.type)
        search_template = Search().extra(size=0, track_total_hits=True)
        search_template = dataset_widget.set_search(search_template)
        search_template = search_helper.add_agg_text_tokens_terms(
            search_template, size=self._top_n_words)

        search = MultiSearch()
        for cur_date in date_range(self._min_date, self._max_date):
            search = search.add(
                search_template.filter(
                    search_helper.query_date_range(gte=cur_date,
                                                   lt=cur_date +
                                                   timedelta(days=1))))

        time_before = time()
        responses = search.execute()
        time_after = time()
        took_msecs = int((time_after - time_before) * 1000)

        word_freqs = defaultdict(lambda: [0] * len(responses))
        num_docs = []
        for i, response in enumerate(responses):
            num_docs.append(response.hits.total.value)
            for bucket in search_helper.read_agg_text_tokens_terms(response):
                word_freqs[bucket.key][i] = bucket.doc_count

        return word_freqs, num_docs, took_msecs
Example #8
def run_searches(es, index, searches):
    """Ejecuta una lista de búsquedas Elasticsearch. Internamente, se utiliza
    la función MultiSearch.

    Args:
        es (Elasticsearch): Conexión a Elasticsearch.
        index (str): Nombre del índice sobre el cual se deberían ejecutar las
            queries.
        searches (list): Lista de búsquedas, de tipo Search.

    Raises:
        DataConnectionException: si ocurrió un error al ejecutar las búsquedas.

    Returns:
        list: Lista de resultados, cada resultado contiene una lista de 'hits'
            (documentos encontrados).

    """
    ms = MultiSearch(index=index, using=es)

    for search in searches:
        ms = ms.add(search)

    try:
        responses = ms.execute(raise_on_error=True)

        return [[hit.to_dict() for hit in response.hits]
                for response in responses]
    except elasticsearch.ElasticsearchException:
        raise DataConnectionException()
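
A hedged usage sketch for run_searches above; the client, index and field
names are assumptions, and each returned item is the list of hit dicts for
the corresponding search.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Hypothetical usage; index and field names are illustrative only.
es = Elasticsearch()  # assumes a reachable cluster on the default host/port
searches = [
    Search().query("match", name="Rosario"),
    Search().query("match", name="Mendoza"),
]
for hits in run_searches(es, "municipalities", searches):
    print(len(hits), "documents matched")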
Example #9
    def get_object_list(self, request):
        user = request.user
        query_text = request.GET.get('query', None)
        if not user.is_authenticated:
            result_dict = simple_search_public_data(query_text)
            return [SearchObject(id=1, hits=result_dict)]
        groups = user.groups.all()
        index_list = ['experiments', 'dataset', 'datafile']
        ms = MultiSearch(index=index_list)

        query_exp = Q("match", title=query_text)
        query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
            Q("term", public_access=100)
        for group in groups:
            query_exp_oacl = query_exp_oacl | \
                                 Q("term", objectacls__entityId=group.id)
        query_exp = query_exp & query_exp_oacl
        ms = ms.add(
            Search(index='experiments').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_exp))

        query_dataset = Q("match", description=query_text)
        query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
            Q("term", **{'experiments.public_access': 100})
        for group in groups:
            query_dataset_oacl = query_dataset_oacl | \
                                 Q("term", **{'experiments.objectacls.entityId': group.id})
        ms = ms.add(
            Search(index='dataset').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_dataset).query(
                    'nested', path='experiments', query=query_dataset_oacl))

        query_datafile = Q("match", filename=query_text)
        query_datafile_oacl = Q("term", experiments__objectacls__entityId=user.id) | \
            Q("term", experiments__public_access=100)
        for group in groups:
            query_datafile_oacl = query_datafile_oacl | \
                                 Q("term", experiments__objectacls__entityId=group.id)
        query_datafile = query_datafile & query_datafile_oacl
        ms = ms.add(
            Search(index='datafile').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_datafile))
        results = ms.execute()
        result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
        for item in results:
            for hit in item.hits.hits:
                if hit["_index"] == "dataset":
                    result_dict["datasets"].append(hit.to_dict())

                elif hit["_index"] == "experiments":
                    result_dict["experiments"].append(hit.to_dict())

                elif hit["_index"] == "datafile":
                    result_dict["datafiles"].append(hit.to_dict())

        return [SearchObject(id=1, hits=result_dict)]
Example #10
def multi_search(searches):
    ms = MultiSearch(using=conn, index="log-index")

    for search in searches:
        ms = ms.add(search)

    response = ms.execute()

    return response
Example #11
def select_fields(all_fields, search, number_of_groups):
    '''
    Selects the fields from the given Fields object which are most common across the given
    resource ids. The search parameter is used to limit the records that contribute fields to the
    returned selection. The fields returned must appear in the search in at least one resource with
    at least one value present.

    :param all_fields: a Fields object
    :param search: an elasticsearch-dsl search object
    :param number_of_groups: the number of groups to select from the Fields object and return
    :return: a list of groups, each group is a dict containing:
                - "group" - the group name
                - "count" - the number of resources its fields appear in
                - "records" - the number of records the group's fields appear in
                - "fields" - the fields that make up the group along with the resource ids they come
                             from
                - "forced" - whether the field was forced into being included, or whether it was
                             included organically
    '''
    selected_fields = []
    # make sure we don't get any hits back, we're only interested in the counts
    search = search.extra(size=0)

    # iterate over the groups and searches in chunks
    for chunk in chunk_iterator(all_fields.get_searches(search),
                                chunk_size=number_of_groups):
        groups, searches = zip(*chunk)
        # create a multisearch for all the searches in the group
        multisearch = MultiSearch(using=common.ES_CLIENT)
        for search in searches:
            multisearch = multisearch.add(search)

        for (group, count, fields), response in zip(groups,
                                                    multisearch.execute()):
            if all_fields.is_forced(group) or response.hits.total > 0:
                # a field from this group has values in the search result, add it to the selection
                selected_fields.append(
                    dict(group=group,
                         count=count,
                         records=response.hits.total,
                         fields=fields,
                         forced=all_fields.is_forced(group)))

        if len(selected_fields) >= number_of_groups:
            break

    def group_sorter(the_group):
        # this sorts the groups ensuring forced groups are first, in the order they were forced,
        # then the groups with highest count and then the ones with the highest number of records
        if the_group[u'forced']:
            # use 0 0 to ensure that the base order of the groups is maintained for forced groups
            return True, 0, 0
        else:
            return False, the_group[u'count'], the_group[u'records']

    # sort the returned selected list by count and secondly records
    return sorted(selected_fields, key=group_sorter, reverse=True)
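
The chunk_iterator helper used above is not shown in this example; a minimal
sketch of such a helper, assuming it only needs to yield fixed-size tuples
from any iterable (the real implementation may differ):

from itertools import islice

def chunk_iterator(iterable, chunk_size):
    # Yield tuples of at most chunk_size consecutive items from the iterable.
    iterator = iter(iterable)
    while True:
        chunk = tuple(islice(iterator, chunk_size))
        if not chunk:
            break
        yield chunk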
Example #12
def es_create_result_csv_bulk(name, index, result_size=200, batch_size=1000):
    start_time = time.time()
    index_size = Search(index=index).count()
    rest = index_size % batch_size
    results = []
    for i in range(0, index_size - rest, batch_size):
        multisearch = MultiSearch(index=index)
        print(f'generating results number {i} to {i + batch_size}')
        for item in range(i, i + batch_size):
            multisearch = multisearch.add(
                create_mlt_with_id(item, index, result_size))
        responses = multisearch.execute()
        for index_id, response in enumerate(responses, start=i):
            results.append(
                [str(index_id)] +
                [f'{hit.meta.id} ({hit.meta.score})' for hit in response])
    if rest:
        multisearch = MultiSearch(index=index)
        for item_id in range(index_size - rest, index_size):
            multisearch = multisearch.add(
                create_mlt_with_id(item_id, index, result_size))
        responses = multisearch.execute()
        for index_id, response in enumerate(responses, start=index_size - rest):
            results.append(
                [str(index_id)] +
                [f'{hit.meta.id} ({hit.meta.score})' for hit in response])
    try:
        os.mkdir(f'{faiss_path}/{name}/')
    except FileExistsError:
        print('Directory already exists; removing it and recreating it.')
        shutil.rmtree(f'{faiss_path}/{name}/')
        os.mkdir(f'{faiss_path}/{name}/')
    with open(f'{faiss_path}/{name}/search_rankings.csv', 'w',
              newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for line in results:
            wr.writerow(line)
    stop_time = time.time() - start_time
    with open(f'./datasets/elasticsearch_{name}_timing', 'a') as f:
        f.write(f'time for generating es results for {name}: {stop_time}\n')
    return stop_time
Example #13
def test_multi_missing(data_client):
    s1 = Repository.search()
    s2 = Search(doc_type='commits')
    s3 = Search(index='does_not_exist')

    ms = MultiSearch()
    ms = ms.add(s1).add(s2).add(s3)

    with raises(TransportError):
        ms.execute()

    r1, r2, r3 = ms.execute(raise_on_error=False)

    assert 1 == len(r1)
    assert isinstance(r1[0], Repository)
    assert r1.search is s1

    assert 52 == r2.hits.total
    assert r2.search is s2

    assert r3 is None
Example #14
def test_multi_missing(data_client):
    s1 = Repository.search()
    s2 = Search(index='flat-git')
    s3 = Search(index='does_not_exist')

    ms = MultiSearch()
    ms = ms.add(s1).add(s2).add(s3)

    with raises(TransportError):
        ms.execute()

    r1, r2, r3 = ms.execute(raise_on_error=False)

    assert 1 == len(r1)
    assert isinstance(r1[0], Repository)
    assert r1._search is s1

    assert 52 == r2.hits.total
    assert r2._search is s2

    assert r3 is None
Example #15
def test_multi_search(data_client):
    s1 = Repository.search()
    s2 = Search(doc_type='commits')

    ms = MultiSearch(index='git')
    ms = ms.add(s1).add(s2)

    r1, r2 = ms.execute()

    assert 1 == len(r1)
    assert isinstance(r1[0], Repository)
    assert r1.search is s1

    assert 52 == r2.hits.total
    assert r2.search is s2
Example #16
def get_usernames_for_crawl():
    ms = MultiSearch(index='populars')
    q = Q({"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search().query(q)
    total = never_updated.count()
    never_updated = never_updated[0:total]
    old_updated = Search().query('range', last_update={"lte": "now-2d"})
    total = old_updated.count()
    old_updated = old_updated[0:total]
    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    responses = ms.execute()
    for res in responses:
        for hit in res:
            yield hit.username
Example #17
def multisearch(*models, **params):
    ms = MultiSearch(using=es.client, index=es.index_name)
    queries = []
    for model in models:
        s = search_for(model, **params)
        ms = ms.add(s._s)
        queries.append(s)
    responses = ms.execute()
    return [
        # _d_ is the only way to access the raw data
        # allowing to rewrap response in a FacetedSearch
        # because default multisearch loose facets
        SearchResult(query, response._d_)
        for query, response in zip(queries, responses)
    ]
Example #18
def test_multi_search(data_client):
    s1 = Repository.search()
    s2 = Search(index="flat-git")

    ms = MultiSearch()
    ms = ms.add(s1).add(s2)

    r1, r2 = ms.execute()

    assert 1 == len(r1)
    assert isinstance(r1[0], Repository)
    assert r1._search is s1

    assert 52 == r2.hits.total.value
    assert r2._search is s2
Example #19
def test_multi_search(data_client):
    s1 = Repository.search()
    s2 = Search(index='flat-git')

    ms = MultiSearch()
    ms = ms.add(s1).add(s2)

    r1, r2 = ms.execute()

    assert 1 == len(r1)
    assert isinstance(r1[0], Repository)
    assert r1._search is s1

    assert 52 == r2.hits.total
    assert r2._search is s2
Example #20
    def query(self, queries, size, record_fnum):
        ms = MultiSearch(using=self.es, index=self.index_name)
        for q in queries:
            s = Search().query("match", userid=q[0]).query("match", record=q[1])[:size]
            ms = ms.add(s)
        responses = ms.execute()

        res_batch = []
        for response in responses:
            res = []
            for hit in response:
                res.append([int(hit.userid)] + list(map(int, hit.record.split(','))))
            if len(res) < size:
                res += [np.zeros([record_fnum,]).astype(np.int32).tolist()] * (size - len(res))
            res_batch.append(res)
        return res_batch
        
Example #21
    def get(self, request, *args, **kwargs):

        query = request.GET.get('q')
        coords = [
            request.GET.get('latitude'),
            request.GET.get('longitude'),
            request.GET.get('radius')
        ]
        latitude, longitude, radius = get_user_coordinates(coords, request)
        ms = MultiSearch(index=['restaurants', 'categories'])

        if query:
            cs = CategoryDocument.search().query("query_string",
                                                 query=query,
                                                 default_field="label")
            q = Q('query_string', query=query, default_field='name')
            q |= Q('nested',
                   path='categories',
                   query=Q('query_string',
                           query=query,
                           default_field='categories.label'))
            rs = RestaurantDocument.search().filter('geo_distance',
                                                    distance='%smi' % radius,
                                                    location={
                                                        "lat": latitude,
                                                        "lon": longitude
                                                    }).query(q)

            ms = ms.add(cs)
            ms = ms.add(rs)
            responses = ms.execute()

            aggregate = []

            for response in responses:
                hits = response['hits']['hits']
                aggregate += [hit.to_dict() for hit in hits]

        else:
            cs = CategoryDocument.search().source([])
            cs = cs[0:10]
            response = cs.execute()
            hits = response['hits']['hits']
            aggregate = [hit.to_dict() for hit in hits]

        return Response(aggregate)
Example #22
def multi_search(request):
	client = Elasticsearch()
	q = request.GET.get('q')
	if q:
		ms = MultiSearch(using=client, index="esdocument-index")
		ms = ms.add(Search().query("match", author=q))
		ms = ms.add(Search().query("match", title=q))
		ms = ms.add(Search().query("match", json_object=q))
		responses = ms.execute()
		hits = []
		for response in responses:
			for hit in response:
				hit = hit.title
				hits.append(hit)
	else:
		responses = 'empty'
		hits = []

	return render(request, 'elasticsearchapp/search.html',
		{'responses': responses, 'hits': hits})
Example #23
    def execute_searches(self):
        """Ejecuta la query de todas las series agregadas, e inicializa
        los atributos data y count a partir de las respuestas.
        """
        if not self.series:
            raise QueryError(strings.EMPTY_QUERY_ERROR)

        multi_search = MultiSearch(index=self.index,
                                   doc_type=settings.TS_DOC_TYPE,
                                   using=self.elastic)

        for serie in self.series:
            serie.add_collapse(self.args[constants.PARAM_PERIODICITY])
            multi_search = multi_search.add(serie.search)

        responses = multi_search.execute()
        formatter = ResponseFormatter(self.series, responses, self.args)
        self.data = formatter.format_response()

        self.count = max([response.hits.total for response in responses])
Example #24
    def execute_searches(self):
        """Ejecuta la query de todas las series agregadas, e inicializa
        los atributos data y count a partir de las respuestas.
        """

        multi_search = MultiSearch(index=self.index,
                                   doc_type=settings.TS_DOC_TYPE)

        for serie in self.series:
            multi_search = multi_search.add(serie.search)

        responses = multi_search.execute()
        formatter = ResponseFormatter(self.series, responses,
                                      self.args[constants.PARAM_SORT],
                                      self.args[constants.PARAM_PERIODICITY])

        return {
            'data': (formatter.format_response()),
            'count': max([response.hits.total for response in responses])
        }
Example #25
def calculate_field_counts(request, es_client):
    '''
    Given a download request and an elasticsearch client to work with, work out the number of values
    available per field, per resource for the search.

    :param request: the DownloadRequest object
    :param es_client: the elasticsearch client to use
    :return: a dict of resource ids -> fields -> counts
    '''
    field_counts = defaultdict(dict)
    for resource_id, version in request.resource_ids_and_versions.items():
        index_name = prefix_resource(resource_id)
        # get the base field mapping for the index so that we know which fields to look up, this
        # will get all fields from all versions and therefore isn't usable straight off the bat, we
        # have to then go and see which fields are present in the search at this version
        mapping = es_client.indices.get_mapping(index_name)[index_name]

        # we're going to do a multisearch to find out the number of records that have a value for
        # each field from the mapping
        search = MultiSearch(using=es_client, index=index_name)
        base_search = Search.from_dict(request.search) \
            .index(index_name) \
            .using(es_client) \
            .extra(size=0) \
            .filter(create_version_query(version))

        # get all the fields names and use dot notation for nested fields
        fields = [
            u'.'.join(parts) for parts, _config in iter_data_fields(mapping)
        ]
        for field in fields:
            # add a search which finds the documents that have a value for the given field at the
            # right version
            search = search.add(
                base_search.filter(u'exists', field=prefix_field(field)))

        responses = search.execute()
        for field, response in zip(fields, responses):
            field_counts[resource_id][field] = response.hits.total

    return field_counts
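
The per-field counting pattern above generalizes beyond this project's
helpers; a self-contained sketch, where the client, index name and field
names are assumptions and the version filtering is omitted:

from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

# One exists-filtered, size-0 search per field, all sent in a single msearch;
# each response's hit total is the number of documents with a value for that
# field. Index and field names are illustrative.
es = Elasticsearch()
index_name = "my-index"
fields = ["name", "genus", "locality"]

ms = MultiSearch(using=es, index=index_name)
base = Search().extra(size=0)
for field in fields:
    ms = ms.add(base.filter("exists", field=field))

field_counts = {
    field: response.hits.total
    for field, response in zip(fields, ms.execute())
}
print(field_counts)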
Example #26
def find_searched_resources(search, resource_ids):
    '''
    Given a search and a list of resource ids to search in, returns a list of the resources that are
    actually included in the search results.

    :param search: an elasticsearch-dsl object
    :param resource_ids: a list of resource ids
    :return: a list of resource ids
    '''
    # we have to make a copy as aggs don't return a clone :(
    search_copy = copy(search)
    search_copy = search_copy.index(
        [prefix_resource(resource_id) for resource_id in resource_ids])
    search_copy.aggs.bucket(u'indexes', u'terms', field=u'_index')
    multisearch = MultiSearch(using=common.ES_CLIENT).add(search_copy)
    result = next(iter(multisearch.execute()))
    return [
        trim_index_name(bucket[u'key'])
        for bucket in result.aggs.to_dict()[u'indexes'][u'buckets']
        if bucket[u'doc_count'] > 0
    ]
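
The same trick works without the project-specific prefix helpers; a minimal
sketch, assuming plain index names and a locally reachable client:

from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

# Run one aggregation-only search across several indices and read the
# per-index hit counts from a terms aggregation on _index.
es = Elasticsearch()
search = Search(index=["index-a", "index-b"]).extra(size=0)
search.aggs.bucket("indexes", "terms", field="_index")

result = next(iter(MultiSearch(using=es).add(search).execute()))
searched = [
    bucket["key"]
    for bucket in result.aggs.to_dict()["indexes"]["buckets"]
    if bucket["doc_count"] > 0
]
print(searched)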
Example #27
    def execute_queries(self, queries: Dict[Resource, Q],
                        page_index: int,
                        results_per_page: int) -> List[Response]:
        multisearch = MultiSearch(using=self.elasticsearch)

        for resource in queries.keys():
            query_for_resource = queries.get(resource)
            search = Search(index=self.get_index_for_resource(resource_type=resource)).query(query_for_resource)
            LOGGER.info(search.to_dict())
            # pagination
            start_from = page_index * results_per_page
            end = results_per_page * (page_index + 1)

            search = search[start_from:end]

            multisearch = multisearch.add(search)
        try:
            response = multisearch.execute()
            return response
        except Exception as e:
            LOGGER.error(f'Failed to execute ES search queries. {e}')
            return []
Example #28
def run_multiple_filters():
    while True:
        index = "book"
        ms = MultiSearch(index=index)
        ask_price_filter = AskPriceFilter(index)
        search_ask = ask_price_filter.main_query(
            gt_price=50, lt_price=52, from_range=10, to_range=20
        )
        ms = ms.add(search_ask)
        bid_price_filter = BidPriceFilter(index)
        search_bid = bid_price_filter.main_query(gt_price=50, lt_price=51, to_range=15)
        ms = ms.add(search_bid)

        responses = ms.execute()  # returns a list of Response objects
        for resp in responses:
            print(len(resp))
            print(resp.hits.total.value)

        ask_price_filter.show_result(from_range=0, to_range=5)
        bid_price_filter.show_result(from_range=0, to_range=5)

        time.sleep(5)
Example #29
def update_jobs(entries, history=False):
    """
    Generate updates to claims.* from job classad dictionaries
    """

    def parent_slot_name(dynamic_slot_name):
        parts = dynamic_slot_name.split("@")
        match = re.match(r"(slot\d+)_\d+", parts[0])
        if match:
            parts[0] = match.group(1)
        return "@".join(parts)

    # MultiSearch will fail if there are no queries to run
    jobs = list(entries)
    if not jobs:
        return

    # glidein names are not necessarily unique on long time scales. look up the
    # last glidein that started with the advertised name _before_ the evicted
    # job was started
    ms = MultiSearch(using=es, index=options.indexname)
    for hit in jobs:
        if history:
            t0 = hit["JobCurrentStartDate"]
        else:
            if hit["JobStatus"] == 5:
                t0 = hit["JobCurrentStartDate"]
            else:
                t0 = hit["JobLastStartDate"]
        ms = ms.add(
            Search()
            .filter("term", Name__keyword=parent_slot_name(hit["LastRemoteHost"]))
            .filter("range", DaemonStartTime={"lte": datetime.utcfromtimestamp(t0)},)
            .sort({"DaemonStartTime": {"order": "desc"}})
            .source(["nuthin"])[:1]
        )

    for hit, match in zip(jobs, ms.execute()):
        if not match.hits:
            continue
        if history:
            if hit["JobStatus"] == 3:
                category = "removed"
            elif hit["ExitCode"] == 0:
                category = "finished"
            else:
                category = "failed"
            walltime = float(hit["EnteredCurrentStatus"] - hit["JobCurrentStartDate"])
        else:
            # NB: if a job is evicted from one slot, held on another, and then
            # removed from the queue, there's no way to recover the time that
            # may have elapsed between hold and removal. To handle this case,
            # we treat held jobs as a subcategory of removed jobs, so that they
            # will not be counted again when encountered in the history.
            if hit["JobStatus"] == 5:
                walltime = float(hit["EnteredCurrentStatus"] - hit["JobCurrentStartDate"])
                category = "removed"
            else:
                walltime = float(hit["LastVacateTime"] - hit["JobLastStartDate"])
                category = "evicted"

        # normalize capitalization of requests
        requests = {resource: 0 for resource in RESOURCES}
        for k in hit:
            if k.startswith("Request"):
                requests[k[7:]] = walltime * hit[k]

        doc = {
            "_op_type": "update",
            "_index": match.hits[0].meta.index,
            "_type": match.hits[0].meta.doc_type,
            "_id": match.hits[0].meta.id,
            "script": {
                "id": options.indexname + "-update-jobs",
                "params": {
                    "job": hit["GlobalJobId"].replace("#", "-").replace(".", "-"),
                    "category": category,
                    "requests": requests,
                },
            },
        }
        yield doc
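
The per-job lookup above (newest document at or before a timestamp, one hit,
no source fields) can be sketched in isolation; the client, index name, field
name and timestamps are illustrative assumptions:

from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

# For each timestamp, fetch only the single most recent document whose
# DaemonStartTime is at or before it.
es = Elasticsearch()
timestamps = [1577836800, 1577923200]

ms = MultiSearch(using=es, index="glideins")
for t0 in timestamps:
    ms = ms.add(
        Search()
        .filter("range", DaemonStartTime={"lte": datetime.utcfromtimestamp(t0)})
        .sort({"DaemonStartTime": {"order": "desc"}})
        .source(False)[:1]
    )

for t0, match in zip(timestamps, ms.execute()):
    print(t0, "->", match.hits[0].meta.id if match.hits else None)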
Example #30
    def batch_request(cls, names):
        """
        Map all name fragments in the array to name hashes.

        Takes an array of arrays (names are tokenized) and returns
        hashes and labels from ES.
        """
        # TODO: THROW IT AWAY AND REPLACE WITH DAWG
        def search_clause(term):
            # TODO: case for initials
            return cls.search().filter("term", term=term)

        def transform_resp(resp):
            labels = list(set(resp.lemma_labels) - {"lemma"})
            assert len(labels) == 1

            label = {
                "lemma-firstname": "firstname",
                "lemma-patronymic": "patronymic",
                "lemma-lastname": "lastname",
                "lemma-firstname-typo": "firstname",
                "lemma-patronymic-typo": "patronymic",
                "lemma-lastname-typo": "lastname"
            }[labels[0]]

            return {
                "term": resp.term,
                "lemma": resp.lemma,
                "label": label
            }

        def match_req_resp(name, hashes):
            res = []

            for chunk, resp in zip(name, hashes):
                if resp:
                    res.append(list(map(transform_resp, resp)))
                else:
                    res.append([{
                        "lemma": sha1((chunk + "thisissalt").encode('utf-8')).hexdigest(),
                        "label": "no-match",
                        "term": chunk
                    }])
            return res

        qs = MultiSearch(index=cls._doc_type.index)
        for name in names:
            for chunk in name:
                qs = qs.add(search_clause(chunk))

        response = qs.execute()
        results = []

        pos = 0
        for name in names:
            l = len(name)

            res_chunk = match_req_resp(name, response[pos:pos + l])

            results.append(res_chunk)
            pos += l

        return results
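
batch_request adds one search per token and then re-slices the flat response
list back into per-name chunks; a self-contained sketch of that positional
re-chunking, with made-up strings standing in for the ES responses:

# Tokenized names and one fake "response" per token (illustrative data).
names = [["john", "smith"], ["ada"]]
flat_responses = ["r-john", "r-smith", "r-ada"]

results = []
pos = 0
for name in names:
    # Each name consumes as many responses as it has tokens.
    results.append(flat_responses[pos:pos + len(name)])
    pos += len(name)

print(results)  # [['r-john', 'r-smith'], ['r-ada']]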
Example #31
    def extract_keyterms_and_contexts(self):
        """ Execute keyterms and highlight/offset queries

        Task 2: Extract keyterms and offsets (and optionally contexts). Add the corresponding ES/mongo updates
            to an update queue for indexing. Execute task over multiple threads to speed things up.
        """
        while self.task_manager.task_running('keyterm_query_fetching') or len(
                self.keyterms_query_batches) > 0:
            try:
                # Get the next batch
                batch = self.keyterms_query_batches.popleft()
            except IndexError:
                # Queue is empty; wait
                time.sleep(1)
                continue

            # Execute keyterms queries, getting the keyterms, offsets and contexts
            #  and adding the payloads to the updates queue
            if self.use_termvectors:
                # When we use termvectors, we can't extract highlight/offset information
                # from the analyzed/tokenized field, and so we have to extract from the termvectors.
                termvecs = self.es_utility.es.mtermvectors(
                    ids=batch['_ids'],
                    index=self.es_index_name,
                    doc_type='_doc')
                # Since we don't know the order of the returned termvectors,
                #  we create a lookup from keyterms to their locations
                doc_termvector_lookup = {
                    x['_id']:
                    x['term_vectors'][self.es_highlight_field]['terms']
                    for x in termvecs['docs']
                }
                batch_toks_to_locs = []
                batch_contexts = []
                for i, (_id, tv_attrs) in enumerate(
                        zip(batch['_ids'], termvecs['docs'])):
                    keyterms_set = set(batch['keyterms'][i])
                    doc_tok_offsets, context = extract_termvectors_contexts_and_keyterm_offsets(
                        doc_termvector_lookup[_id], keyterms_set,
                        self.termvectors_window_size, self.extract_contexts)
                    batch_toks_to_locs.append(doc_tok_offsets)
                    batch_contexts.append(context)
            else:
                ms = MultiSearch(index=self.es_index_name).using(
                    self.es_utility.es)
                for _id, _keyterms in zip(batch['_ids'], batch['keyterms']):
                    offsets_search, highlight_search = get_context_and_offset_query(
                        _id, _keyterms, self.es_utility.es,
                        self.es_highlight_field)
                    ms = ms.add(offsets_search).add(highlight_search)
                resp = ms.execute(raise_on_error=True)
                batch_toks_to_locs, batch_contexts = extract_contexts_and_keyterm_offsets(
                    resp,
                    self.es_highlight_field,
                    extract_context=self.extract_contexts)
            if self.extract_contexts:
                for _id, keyterm_locs, contexts in zip(batch['_ids'],
                                                       batch_toks_to_locs,
                                                       batch_contexts):
                    if keyterm_locs is None:
                        keyterms = offsets = None
                    else:
                        keyterms, offsets = list(keyterm_locs.keys()), list(
                            keyterm_locs.values())
                    self.updates.append({
                        '_id': _id,
                        'body': {
                            self.keywords_field_name: keyterms,
                            self.offsets_field_name: offsets,
                            self.contexts_field_name: contexts
                        }
                    })
            else:
                for _id, keyterm_locs in zip(batch['_ids'],
                                             batch_toks_to_locs):
                    if keyterm_locs is None:
                        keyterms = offsets = None
                    else:
                        keyterms, offsets = list(keyterm_locs.keys()), list(
                            keyterm_locs.values())
                    self.updates.append({
                        '_id': _id,
                        'body': {
                            self.keywords_field_name: keyterms,
                            self.offsets_field_name: offsets
                        }
                    })
        # Notify that the thread has finished
        self.task_manager.add_completed('updates_extraction')
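
In the non-termvectors branch above, two searches are added per document, so
the responses must be consumed in pairs; a hedged sketch of that pattern,
where the index, field and ids are illustrative and a default connection is
assumed to be configured:

from elasticsearch_dsl import MultiSearch, Search

doc_ids = ["a1", "b2"]
ms = MultiSearch(index="docs")  # assumes a default connection is registered
for _id in doc_ids:
    offsets_search = Search().query("ids", values=[_id])
    highlight_search = Search().query("ids", values=[_id]).highlight("text")
    ms = ms.add(offsets_search).add(highlight_search)

responses = ms.execute()
# Responses come back in submission order: even positions are the offsets
# searches, odd positions the highlight searches.
for _id, offsets_resp, highlight_resp in zip(doc_ids, responses[::2],
                                             responses[1::2]):
    print(_id, offsets_resp.hits.total, highlight_resp.hits.total)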
Example #32
h = response.hits[0]
print('/%s/%s/%s returned with score %f' % (
    h.meta.index, h.meta.doc_type, h.meta.id, h.meta.score))

# Aggregations
for tag in response.aggregations.per_tag.buckets:
    print(tag.key, tag.max_lines.value)

# Multisearch

from elasticsearch_dsl import MultiSearch, Search
ms = MultiSearch(index='blogs')
ms = ms.add(Search().filter('term', tags='python'))
ms = ms.add(Search().filter('term', tags='elasticsearch'))

responses = ms.execute()

for response in responses:
    print("result for query %r." % response.search.query)
    for hit in response:
        print(hit.title)



##################################
# PERSISTENCE

# Mappings

from elasticsearch_dsl import Keyword, Mapping, Nested, Text
Example #33
    def validate(cls, address):
        try:
            if "fullAddress" not in address:
                address["fullAddress"] = ""

            if "source" not in address:
                address["source"] = ""

            ms = MultiSearch(index=ADDRESSES_INDEX)

            should = []
            if "postalCode" in address:
                should.append(Q("match", postalCode=address["postalCode"]))
            if "region" in address:
                should.append(Q("match", region=address["region"]))

            if "fullAddress" in address:
                ms = ms.add(
                    cls.search().query(
                        "bool",
                        must=Q(
                            "simple_query_string",
                            fields=["all.shingle"],
                            query=address["fullAddress"],
                            default_operator="or",
                        ),
                        should=should,
                    )
                ).add(
                    cls.search().query(
                        "bool",
                        must=Q(
                            "simple_query_string",
                            fields=["all"],
                            query=address["fullAddress"],
                            default_operator="or",
                        ),
                        should=should,
                    )
                )

            if address["source"]:
                ms = ms.add(
                    cls.search().query(
                        "bool",
                        must=Q(
                            "simple_query_string",
                            fields=["all"],
                            query=address["source"],
                            default_operator="or",
                        ),
                        should=should,
                    )
                )

            responses = ms.execute()

            new_address = {}
            max_score = 0
            for resp in responses:
                if resp.hits.max_score is not None and resp.hits.max_score >= max_score:
                    new_address = resp[0].to_dict()
                    max_score = resp.hits.max_score

            if new_address:
                address["fullAddress"] = ""

                if new_address.get("postalCode"):
                    address["postalCode"] = new_address["postalCode"].rjust(5, "0")
                    address["fullAddress"] += address["postalCode"]

                if new_address.get("region"):
                    address["region"] = new_address["region"]

                if new_address.get("region") not in ["місто Київ", "місто Севастополь"]:
                    address["fullAddress"] += ", " + address["region"]

                if new_address.get("district"):
                    address["district"] = new_address["district"]
                    address["fullAddress"] += ", " + address["district"]

                if address.get("locality") and new_address.get("locality"):
                    if (
                        distance(
                            new_address["locality"].lower(), address["locality"].lower()
                        )
                        < 6
                        or distance(
                            new_address["oldLocality"].lower(),
                            address["locality"].lower(),
                        )
                        < 6
                    ):
                        address["locality"] = new_address["locality"]

                    address["fullAddress"] += ", " + address["locality"]
                elif new_address.get("locality"):
                    address["locality"] = new_address["locality"]
                    address["fullAddress"] += ", " + address["locality"]

                if address.get("streetAddress") and new_address.get("street"):
                    if (
                        distance(
                            new_address["street"].lower(),
                            address["streetAddress"].lower(),
                        )
                        < 6
                        or distance(
                            new_address["oldStreet"].lower(),
                            address["streetAddress"].lower(),
                        )
                        < 6
                    ):
                        address["streetAddress"] = new_address["street"]

                    address["fullAddress"] += ", " + address["streetAddress"]

                if new_address.get("oldStreet"):
                    address["oldStreet"] = new_address["oldStreet"]

                if new_address.get("oldDistrict"):
                    address["oldDistrict"] = new_address["oldDistrict"]

                if new_address.get("oldLocality"):
                    address["oldLocality"] = new_address["oldLocality"]

                if address.get("streetNumber"):
                    address["streetAddress"] += ", " + address["streetNumber"]
                    address["fullAddress"] += ", " + address["streetNumber"]

                    del address["streetNumber"]
        except (ValueError, KeyError, IndexError) as e:
            print(e)
            return address

        return address
Example #34
#!/usr/bin/env python

from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

client = Elasticsearch(['192.168.33.108:9200','192.168.33.109:9200'])

# multi search "hello" on message field.
ms = MultiSearch(using=client,index='logstash-*')
ms = ms.add(Search().query("match", message="hello"))
ms = ms.add(Search().query("match", message="hello7"))
responses = ms.execute()

for response in responses:
    for r in response:
        print(r['host'], r['message'])
Example #35
    def run(self, network, channels, query, author=None, date_range=None):
        # We don't support non-ajax, so will always have date range
        assert date_range
        date_begin, date_end = date_range

        result = Search(
            using=self.es, index='moffle',
        ).query(
            "match", text=query,
        ).query(
            "range", date={
                'gt': date_begin.strftime('%Y%m%d'),
                'lte': date_end.strftime('%Y%m%d'),
            },
        ).filter(
            "terms", line_type=['normal', 'action'],
        ).filter(
            "term", network=network,
        ).filter(
            "terms", channel=channels,
        ).sort(
            "-date",
        )[:10000].execute()

        hits = []
        # TODO: interval merging
        ctx_search = MultiSearch(using=self.es, index='moffle')
        for hit in result:
            # Fetch context
            ctx_search = ctx_search.add(Search(
                using=self.es,
                index='moffle',
            ).query(
                "range", line_no={
                    "gte": hit.line_no - config.SEARCH_CONTEXT,
                    "lte": hit.line_no + config.SEARCH_CONTEXT,
                },
            ).filter(
                "term", network=hit.network,
            ).filter(
                "term", channel=hit.channel,
            ).filter(
                "term", date=hit.date,
            ).sort(
                "line_no",
            ))

        ctx_results = ctx_search.execute()
        for hit, ctx_result in zip(result, ctx_results):
            lines = []
            for ctx_hit in ctx_result:
                lines.append(self._format_line(
                    ctx_hit,
                    is_hit=(hit.line_no == ctx_hit.line_no),
                ))
            hit = Hit(
                channel=hit.channel,
                date=hit.date,
                begin=lines[0].line_no,
                lines=lines,
            )
            hits.append(hit)

        hits = [list(group) for _, group in groupby(hits, key=lambda hit: hit.date)]
        return hits