Example #1
0
 def get_by_unique_key(self, unique_key, name):
     term = 'unique_keys.' + unique_key
     # use **kwargs unpacking because the field name is held in a variable
     search = Search(using=self.es)
     search = search.filter('term', **{term: name})
     search = search.extra(version=True)
     return self._one(search)
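The `**{term: name}` unpacking above is needed because a dotted field name such as 'unique_keys.accession' is not a valid Python keyword argument. A minimal sketch of the same idea, with made-up field and value names:

from elasticsearch_dsl import Search

s = Search()
# a dotted field cannot be written as a plain keyword argument, so unpack a dict instead
s = s.filter('term', **{'unique_keys.accession': 'some-accession'})
# the resulting body is equivalent to:
# {'query': {'bool': {'filter': [{'term': {'unique_keys.accession': 'some-accession'}}]}}}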
Example #2
0
 def get_rev_links(self, model, rel, *item_types):
     search = Search(using=self.es)
     search = search.extra(size=SEARCH_MAX)
     # rel links use '~' instead of '.' due to ES field name restrictions
     proc_rel = rel.replace('.', '~')
     # use **kwargs unpacking because the field name is built dynamically
     search = search.filter('term', **{'links.' + proc_rel: str(model.uuid)})
     if item_types:
         search = search.filter('terms', item_type=item_types)
     hits = search.execute()
     return [hit.to_dict().get('uuid', hit.to_dict().get('_id')) for hit in hits]
Example #3
0
File: app.py Project: urykhy/stuff
    def search(self, **params):
        limit_cat = params.get('cat', "").strip()
        limit_forum = params.get('forum', "").strip()
        limit_count = int(params.get('count', 100))
        limit_size_min = human2bytes(params.get('min', "0b"))
        limit_size_max = human2bytes(params.get('max', "0b"))
        limit_wild = int(params.get('wild', 0))
        arg = params.get('query', '').strip()
        if not arg:
            arg = "hobbit"

        s = Search(using=es, index=ela_index)
        if limit_size_min:
            s = s.filter("range", size = {'gte' : limit_size_min })
        if limit_size_max:
            s = s.filter("range", size = {'lte' : limit_size_max })

        arg = arg.split(' ')
        if limit_wild:
            q = Q("wildcard", name="*"+arg.pop(0)+"*")
            for a in arg:
                q = q & Q("wildcard", name="*"+a+"*")
        else:
            q = Q("match", name=arg.pop(0))
            for a in arg:
                q = q & Q("match", name=a)

        if len(limit_cat):
            for a in limit_cat.split(' '):
                q = q & Q("match", category=a)
        if len(limit_forum):
            for a in limit_forum.split(' '):
                q = q & Q("match", forum=a)

        s = s.query(q)
        #cherrypy.log("query is "+str(s.to_dict()))
        r = s.execute()
        size = r.hits.total
        #cherrypy.log("query have "+str(size)+" elements")
        if size > limit_count:
            size = limit_count
        s = s.sort('-size')
        s = s.extra(size=size)
        r = s.execute()

        data = []
        for b in r:
            a = [b.id, b.size, b.name, b.category, b.forum, b.date[0] if b.date else '', b.hash]
            data.append(a)

        return {'data': data}
Example #4
0
def avg_bytes_per_request():  # return the average size of the object returned to the client for each URL
    avg_bytes_per_request = {}
    s = Search(index="my_index")
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).metric(
        "avg_bytes", "avg", field="bytes"
    )  # group results by URL, then average the byte size
    response = s.extra(size=10000).execute()
    for per_request in response.aggregations.per_request.buckets:
        # map each URL to the average size of the objects it returns
        avg_bytes_per_request[per_request.key] = per_request.avg_bytes.value
    return avg_bytes_per_request
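Note that size=0 on a terms aggregation meant "all buckets" only on older Elasticsearch versions (1.x/2.x); newer clusters reject it. A rough sketch of an equivalent for 5.x and later, reusing the index and field names from the example and assuming a default connection is configured:

s = Search(index="my_index")
s.aggs.bucket("per_request", "terms", field="request.untouched", size=10000) \
    .metric("avg_bytes", "avg", field="bytes")
# size=0 on the search itself skips the hits and returns only the aggregations
response = s.extra(size=0).execute()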
Example #5
0
File: app.py Project: urykhy/stuff
    def search(self, **params):
        limit_author = params.get('author', "").strip()
        limit_title = params.get('title', "").strip()
        limit_count = int(params.get('count', 10))
        limit_wild = int(params.get('wild', 0))
        q = None

        if not limit_author and not limit_title:
            limit_title = "hobbit"

        s = Search(using=es, index=ela_index)
        arg = limit_title.split(' ')
        arg = [x for x in arg if x]
        if len(arg):
            if limit_wild:
                q = Q("wildcard", title="*"+arg.pop(0)+"*")
                for a in arg:
                    q = q & Q("wildcard", title="*"+a+"*")
            else:
                q = Q("match", title=arg.pop(0))
                for a in arg:
                    q = q & Q("match", title=a)

        arg = limit_author.split(' ')
        arg = [x for x in arg if x]
        if len(arg):
            for a in arg:
                if q:
                    q = q & Q("match", author=a)
                else:
                    q = Q("match", author=a)

        s = s.query(q)
        #cherrypy.log("query is "+str(s.to_dict()))
        r = s.execute()
        size = r.hits.total
        if size > limit_count:
            size = limit_count
        s = s.sort('-date')
        s = s.extra(size=size)
        r = s.execute()
        #cherrypy.log("result is "+str(r))

        data = []
        for b in r:
            a = [b.id, b.author, b.title, b.size, b.date]
            data.append(a)

        return {'data': data}
Example #6
0
def avg_nb_con_per_request_per_clientip():  # return the average number of requests made by each client to each page
    avg_nb_con_per_request_per_clientip = {}  # initialize the return dictionary
    s = Search(index="my_index")
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).bucket(
        "per_clientip", "terms", field="clientip", size=10000
    )  # group results by URL, then by client IP
    response = s.extra(size=10000).execute()  # execute the query
    for per_request in response.aggregations.per_request.buckets:  # for each distinct URL
        nb_con_per_request = 0  # initialize the occurrence counter
        for per_clientip in per_request.per_clientip.buckets:  # for each distinct client IP
            nb_con_per_request += per_clientip.doc_count  # count the occurrences
        avg_nb_con_per_request_per_clientip[per_request.key] = nb_con_per_request / len(
            per_request.per_clientip.buckets
        )  # map each URL to its average number of requests per client
    return avg_nb_con_per_request_per_clientip
Example #7
0
def search_command():
    """Performs a search in Elasticsearch."""
    index = demisto.args().get('index')
    query = demisto.args().get('query')
    fields = demisto.args().get('fields')  # fields to display
    explain = 'true' == demisto.args().get('explain')
    base_page = int(demisto.args().get('page'))
    size = int(demisto.args().get('size'))
    sort_field = demisto.args().get('sort-field')
    sort_order = demisto.args().get('sort-order')

    es = elasticsearch_builder()

    que = QueryString(query=query)
    search = Search(using=es,
                    index=index).query(que)[base_page:base_page + size]
    if explain:
        # if the 'explain' parameter is set to 'true' - adds an explanation section to the search results
        search = search.extra(explain=True)

    if fields is not None:
        fields = fields.split(',')
        search = search.source(fields)

    if sort_field is not None:
        search = search.sort({sort_field: {'order': sort_order}})

    response = search.execute().to_dict()

    total_dict, total_results = get_total_results(response)
    search_context, meta_headers, hit_tables, hit_headers = results_to_context(
        index, query, base_page, size, total_dict, response)
    search_human_readable = tableToMarkdown('Search Metadata:',
                                            search_context,
                                            meta_headers,
                                            removeNull=True)
    hits_human_readable = tableToMarkdown('Hits:',
                                          hit_tables,
                                          hit_headers,
                                          removeNull=True)
    total_human_readable = search_human_readable + '\n' + hits_human_readable
    full_context = {
        'Elasticsearch.Search(val.Query == obj.Query && val.Index == obj.Index '
        '&& val.Server == obj.Server && val.Page == obj.Page && val.Size == obj.Size)':
        search_context
    }

    return_outputs(total_human_readable, full_context, response)
Example #8
0
def referrers_per_request():  # return the count of occurrences for each page/previous-page pair
    referrers_per_request = {}
    s = Search(index="my_index")
    s.aggs.bucket("per_request", "terms", field="request.untouched", size=0).bucket(
        "per_referrer", "terms", field="referrer.untouched", size=10000
    )
    response = s.extra(size=10000).execute()
    for per_request in response.aggregations.per_request.buckets:  # for each distinct URL
        referrers_per_request[per_request.key] = {}
        for per_referrer in per_request.per_referrer.buckets:  # for each distinct previous page
            # map each URL/referrer pair to the number of times that relationship occurs
            referrers_per_request[per_request.key][per_referrer.key] = per_referrer.doc_count
    return referrers_per_request
Example #9
0
def get_already_imported_ids(es, es_index_prefix, es_type_name):
    """
    Return the existing Elasticsearch ids for the provided index and type.

    :param es: es-connection instance
    :param es_index_prefix: ``str`` Index prefix
    :param es_type_name: ``str`` ES document type name
    :return: ``set`` Set of already imported ids (O(1) membership checks)
    """
    index_name = es_index_prefix.format('*')
    s = Search(using=es, index=index_name, doc_type=es_type_name)
    s = s.extra(stored_fields=[])
    ids = set()
    for h in s.scan():
        ids.add(h.meta.id)
    return ids
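`extra(stored_fields=[])` keeps the scroll responses down to hit metadata only. A rough alternative, assuming the same `es` connection and `index_name` as above, is to suppress `_source` instead:

s = Search(using=es, index=index_name)
s = s.source(False)  # return no _source, only metadata such as hit.meta.id
ids = {hit.meta.id for hit in s.scan()}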
Example #10
0
def get_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    consolidate_works: bool = False,
    filter_stage: List[str] = [],
    sort: Optional[str] = None,
    limit: int = 25,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:

    search = Search(using=es_client, index=es_index)

    if consolidate_works:
        search = search.extra(
            collapse={
                "field": "source_work_ident",
                "inner_hits": {
                    "name": "source_more",
                    "size": 0,
                },
            })

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term",
                               target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("term", source_stage=filter_stage)

    if sort == "newest":
        search = search.sort("-source_year")
    elif sort == "oldest":
        search = search.sort("source_year")
    else:
        search = search.sort("-source_year")

    return _execute_ref_query(search, limit=limit, offset=offset)
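The `collapse` dict passed through `extra()` is copied verbatim into the request body. Roughly, with `consolidate_works=True` and a work lookup, the generated body looks like the sketch below (the work ident value is a placeholder, and paging from `_execute_ref_query` is omitted):

body = {
    "query": {"bool": {"filter": [{"term": {"target_work_ident": "<work-ident>"}}]}},
    "collapse": {
        "field": "source_work_ident",
        "inner_hits": {"name": "source_more", "size": 0},
    },
    "sort": [{"source_year": {"order": "desc"}}],
}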
Example #11
0
    def get_queryset(self, queryset, data):
        phrase = data.get('q')

        if 'models' not in data:
            models = self._supported_models
        else:
            models = data['models'].split(',')

        advanced = data.get('advanced')
        op, suffix = get_advanced_options(advanced)
        lang = get_language()

        per_model = data.get('per_model', 1)
        ms = MultiSearch(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)

        for model in models:
            if is_enabled('S39_filter_by_geodata.be') and model in self._completion_models:
                sug_query = Search(index=f'{model}s')
                sug_query = sug_query.suggest('title', phrase, completion={
                    'field': f'title.{lang}.suggest',
                    'size': per_model
                })
                res = sug_query.execute()
                suggestions = res.suggest['title'][0]
                ids = [sug['_id'] for sug in suggestions['options']]
                query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
                query = query.filter('term', model=model).query('ids',
                                                                values=ids)
            else:
                query = Search(index=settings.ELASTICSEARCH_COMMON_ALIAS_NAME)
                query = query.filter('term', model=model)
                query = query.query('bool',
                                    should=[
                                        nested_query_with_advanced_opts(
                                            phrase, field, lang, op, suffix)
                                        for field in ('title', 'notes')
                                    ])
                query = query.extra(size=per_model)
            ms = ms.add(query)

        return ms
Example #12
0
    async def get(self):
        """Get the results from Elasticsearch."""
        q = self.request.query.get("q")
        if not q:
            return web.json_response([])

        es = Elasticsearch(
            hosts=[self.request.app["settings"].ELASTICSEARCH_URL],
            timeout=ELASTICSEARCH_TIMEOUT,
            verify_certs=ELASTICSEARCH_VERIFY_CERTS,
        )
        mapping = es.indices.get_mapping(ELASTICSEARCH_INDEX,
                                         include_type_name=True)
        search = Search(index=ELASTICSEARCH_INDEX, using=es)
        search = search.highlight_options(
            pre_tags=[PRE_HIGHLIGHT_TAG],
            post_tags=[POST_HIGHLIGHT_TAG],
        )
        query = self.queries(mapping, q)
        search = search.query(query)
        highlights = self.build_highlight(
            mapping[ELASTICSEARCH_INDEX]["mappings"]["_doc"]["properties"])

        for highlight in highlights:
            search = search.highlight(highlight, type="plain")

        search = search.extra(
            from_=0,
            size=MAX_RESULTS,
        )

        values = []
        for hit in search.execute():
            hit._d_.pop(META, None)
            if HIGHLIGHT and hasattr(hit.meta, "highlight"):
                highlight = hit.meta.highlight
                query = DictQuery(hit._d_)
                for key in highlight:
                    path = key.split(".")[:-1]
                    value = highlight[key][0]
                    query.set("/".join(path), value)
                values.append(query)
            else:
                values.append(hit._d_)
        return web.json_response(values)
Example #13
0
def load_filtered_top_associations_search_after(filters, search_after=''):
    """Retrieves top associations and filter them through the tickable options"""
    s = Search(using=es, doc_type='associations')
    s = s.sort('-score', '_uid')
    s = filter_association_search(s, filters)
    if search_after != '':
        search_after = parse_lastel(search_after)
        print(search_after)
        s = s.extra(search_after=search_after)
    s = s[0:25]
    print(json.dumps(s.to_dict()))
    result = s.execute()
    associations = result['hits']['hits']
    last_el = result['hits']['hits'][-1]['sort']
    # Transformation needed to safeguard URL transmission
    last_el[1] = "-".join(last_el[1].split('#'))
    return [association['_source'].to_dict()
            for association in associations], result['hits']['total'], last_el
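Sorting on '_uid' as a tie breaker only works up to Elasticsearch 6; the '_uid' meta field was removed in 7.x. A hedged sketch of the same tie-breaker on a newer cluster (assuming the same `es` client):

s = Search(using=es)
s = s.sort('-score', '_id')  # '_id' replaces the removed '_uid' meta field as a tie breaker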
Example #14
0
    def execute_query(
        self, es_search: es_dsl.Search, *, from_: int = 0, size: Optional[int] = None
    ) -> Dict:
        if from_ is None:
            raise ValueError("'from_' must have a value.")

        response = {"hits": {"hits": [], "total": 0}}

        if size is None or (from_ + size > search_settings.scan_limit):
            # tmp_search = es_search.extra(from_=0, size=0)
            # tmp_response = tmp_search.execute()
            # tot_hits = tmp_response.hits.total
            tot_hits = es_search.count()
            response["hits"]["total"] = tot_hits
            if size is None:
                size = tot_hits - from_
            if tot_hits < from_:
                return response

        if size + from_ <= search_settings.scan_limit:
            extra_kwargs = {}
            if from_ is not None:
                extra_kwargs["from_"] = from_
            if size is not None:
                extra_kwargs["size"] = size
            if extra_kwargs:
                es_search = es_search.extra(**extra_kwargs)
            return es_search.execute().to_dict()
        else:
            es_search = es_search.params(preserve_order=True, scroll="5m")
            # Workaround
            scan_iter = elasticsearch.helpers.scan(
                es_search._using,
                query=es_search.to_dict(),
                index=es_search._index,
                doc_type=es_search._get_doc_type(),
                **es_search._params,
            )
            # scan_iter = es_search.scan()
            for hit in itertools.islice(scan_iter, from_, from_ + size):
                response["hits"]["hits"].append(hit)
                # response["hits"]["hits"].append(hit.to_dict())

            return response
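The commented-out `es_search.scan()` lines hint at the intended direct form of the scroll branch; inside execute_query, that simpler variant would look roughly like the fragment below, assuming preserve_order stays acceptable for the result sizes involved:

# drop-in sketch for the scroll branch above (uses the method's local names)
es_search = es_search.params(preserve_order=True, scroll="5m")
for hit in itertools.islice(es_search.scan(), from_, from_ + size):
    response["hits"]["hits"].append(hit.to_dict())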
Example #15
0
    def listing(self,
                system=None,
                file_path=None,
                offset=0,
                limit=100,
                **kwargs):
        """Wrap the search result in a BaseFile object for serializtion."""
        query = self.construct_query(**kwargs)
        listing_search = Search()
        listing_search = listing_search.filter(query).sort(
            '_index',
            {'project._exact': {
                'order': 'asc',
                'unmapped_type': 'keyword'
            }}, {'created': {
                'order': 'desc',
                'unmapped_type': 'long'
            }})
        listing_search = listing_search.extra(from_=offset, size=limit)

        res = listing_search.execute()
        children = []
        for hit in res:
            try:
                getattr(hit, 'projectId')
                children.append(BaseESPublication(**hit.to_dict()).to_file())
            except AttributeError:
                children.append(
                    BaseESPublicationLegacy(**hit.to_dict()).to_file())

        result = {
            'trail': [{
                'name': '$SEARCH',
                'path': '/$SEARCH'
            }],
            'name': '$SEARCH',
            'path': '/',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
        return result
Example #16
0
def get_document_text_slice(self, slice_count=0, slice_size=1000, slice_id=0):

    s = Search(using=self.es, index=self.index, doc_type='items') \
        .query(Q({"match_all": {}})) \
        .params(scroll='5m', size=slice_size)
    # s = s.extra(slice={"id": work, "max": 1})
    s = s.extra(slice={'id': slice_id, 'max': slice_count})

    response = s.execute()

    # print("MIN ID:", min(map(int, ([h['_id'] for h in response.hits.hits]))))
    # print("MAX ID:", max(map(int, ([h['_id'] for h in response.hits.hits]))))
    # print("ID COUNT:", len([h['_id'] for h in response.hits.hits]))

    for document in response:
        if 'itemText' in document:
            yield document.meta.id, document['itemText']
        else:
            yield document.meta.id, ''
Example #17
0
 def get_queryset(self):
     try:
         keyword = self.request.GET['keyword']
         s = Search(using=es, index='recipe')
         s.update_from_dict({
             'query': {
                 'match': {
                     'name': {
                         'query': keyword,
                         'type': 'phrase_prefix',
                         'slop': 2
                     }
                 },
             }
         })
         s = s.extra(size=1000)
         results = s.execute()
         return results
     except MultiValueDictKeyError:
         pass
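The `'type': 'phrase_prefix'` option inside a `match` query is not accepted on newer Elasticsearch versions (6.x and later). A rough equivalent for such clusters, assuming the same `es` connection, index, and `keyword` variable as above:

s = Search(using=es, index='recipe')
s = s.query('match_phrase_prefix', name={'query': keyword, 'slop': 2})
s = s.extra(size=1000)
results = s.execute()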
Example #18
0
    def _get_markets_with_dsl(self, from_date, to_date):
        # TODO: This could be fixed now as ES has closed the issue:
        # https://github.com/elastic/elasticsearch-dsl-py/issues/963

        s = Search(using='operations', index="deex-*")
        s = s.extra(size=0)
        s = s.query('bool', filter = [
            Q('term', operation_type=4),
            Q("range", block_data__block_time={'gte': from_date, 'lte': to_date})
        ])

        sources = [ 
            { 'base': A('terms', field='operation_history.op_object.fill_price.base.asset_id.keyword') },
            { 'quote': A('terms', field='operation_history.op_object.fill_price.quote.asset_id.keyword') }
        ]

        # Bug here as 'sources' does not support a list.
        a = A('composite', sources=sources, size=10000).metric('volume', 'sum', field='operation_history.op_object.fill_price.quote.amount')
        s.aggs.bucket('pairs', a)
        response = s.execute()
Example #19
0
    def get_daily_volume(self, from_date, to_date):
        s = Search(using='operations', index="deex-*")
        s = s.extra(size=0)
        s = s.query('bool', filter = [
            Q('term', operation_type=4),
            Q('range', block_data__block_time={'gte': from_date, 'lte': to_date}),
            Q('term', operation_history__op_object__fill_price__quote__asset_id__keyword=config.CORE_ASSET_ID)
        ])

        a = A('date_histogram', field='block_data.block_time', interval='1d', format='yyyy-MM-dd') \
                .metric('volume', 'sum', field='operation_history.op_object.fill_price.quote.amount')
        s.aggs.bucket('volume_over_time', a)

        response = s.execute()

        daily_volumes = []
        for daily_volume in response.aggregations.volume_over_time.buckets:
            daily_volumes.append({ 'date': daily_volume.key_as_string, 'volume': daily_volume.volume.value })
        
        return daily_volumes
Example #20
0
    def search_by_query(self, query: Query) -> Search:
        """
        Get Elasticsearch Search instance by given query object

        :param Query query: query object to construct ES's Search object
        :return: Search object constructed by given `query` param
        """
        def convert(name):
            return ESWords.ASC if name == SortOrder.ASC else ESWords.DESC

        extra_params = dict()
        sort_by = dict()
        search = Search(index=self._index, using=self._es_client)
        q = query.data

        if q.filter_by is not None:
            filter_by = self._query_converter.build(q.filter_by)
            search = search.query(filter_by)

        if q.offset is not None:
            extra_params["from_"] = q.offset
        if q.limit is not None:
            extra_params["size"] = q.limit

        if any((i is not None for i in (q.offset, q.limit))):
            search = search.extra(**extra_params)

        if q.order_by is not None:
            string_field_type = q.order_by.kwargs.get("string_field_type")
            if string_field_type is not None and \
                isinstance(q.order_by.field, StringType):
                sort_by[f"{field_to_str(q.order_by.field)}.{string_field_type}"] = \
                    {ESWords.ORDER: convert(q.order_by.order)}
            else:
                sort_by[field_to_str(q.order_by.field)] = {
                    ESWords.ORDER: convert(q.order_by.order)
                }
            search = search.sort(sort_by)

        return search
Example #21
0
    def _search_runs(
            self,
            experiment_ids: List[str],
            filter_string: str,
            run_view_type: str,
            max_results: int = SEARCH_MAX_RESULTS_DEFAULT,
            order_by: List[str] = None,
            page_token: str = None,
            columns_to_whitelist: List[str] = None) -> Tuple[List[Run], str]:

        if max_results > 10000:
            raise MlflowException(
                "Invalid value for request parameter max_results. It must be at "
                "most {}, but got value {}".format(10000, max_results),
                INVALID_PARAMETER_VALUE)
        stages = LifecycleStage.view_type_to_stages(run_view_type)
        parsed_filters = SearchUtils.parse_search_filter(filter_string)
        filter_queries = [
            Q("match", experiment_id=experiment_ids[0]),
            Q("terms", lifecycle_stage=stages)
        ]
        filter_queries += self._build_elasticsearch_query(parsed_filters)
        sort_clauses = self._get_orderby_clauses(order_by)
        s = Search(index="mlflow-runs").query('bool', filter=filter_queries)
        s = s.sort(*sort_clauses)
        if page_token != "" and page_token is not None:
            s = s.extra(search_after=ast.literal_eval(page_token))
        response = s.params(size=max_results).execute()
        columns_to_whitelist_key_dict = self._build_columns_to_whitelist_key_dict(
            columns_to_whitelist)
        runs = [
            self._hit_to_mlflow_run(hit, columns_to_whitelist_key_dict)
            for hit in response
        ]
        if len(runs) == max_results:
            next_page_token = response.hits.hits[-1].sort
        else:
            next_page_token = []
        return runs, str(next_page_token)
Example #22
0
    def listing(self,
                system=None,
                file_path=None,
                offset=0,
                limit=100,
                **kwargs):
        """Perform the search and output in a serializable format."""

        query = self.construct_query(system, file_path, **kwargs)
        listing_search = Search()
        listing_search = listing_search.filter(query).sort(
            '_index', {'created': {
                'order': 'desc',
                'unmapped_type': 'long'
            }})
        listing_search = listing_search.extra(
            from_=offset, size=limit).source(includes=[
                'project.value', 'created', 'projectId', 'users', 'system'
            ])
        res = listing_search.execute()
        children = []
        for hit in res:
            hit_to_file = BaseESPublication.hit_to_file(hit)
            children.append(hit_to_file)
        result = {
            'trail': [{
                'name': '$SEARCH',
                'path': '/$SEARCH'
            }],
            'name': '$SEARCH',
            'path': '/',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
        return result
Example #23
0
    def listing(self, system, file_path, offset=0, limit=100, **kwargs):
        """Perform the search and output in a serializable format."""

        query = self.construct_query(system, file_path)
        listing_search = Search()
        listing_search = listing_search.query(query)
        listing_search = listing_search.extra(from_=offset, size=limit)
        res = listing_search.execute()
        
        children = []
        print(res.hits.total)
        if res.hits.total:
            children = [o.to_dict() for o in res]

        result = {
            'trail': [{'name': '$SEARCH', 'path': '/$SEARCH'}],
            'name': '$SEARCH',
            'path': '/',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
        return result
Example #24
0
    def listing(self,
                system=None,
                file_path=None,
                offset=0,
                limit=100,
                **kwargs):
        """Perform the search and output in a serializable format."""

        query = self.construct_query(system, file_path, **kwargs)
        listing_search = Search()
        listing_search = listing_search.filter(query).sort('_index')
        listing_search = listing_search.extra(from_=offset, size=limit)
        res = listing_search.execute()
        children = []
        for hit in res:
            try:
                getattr(hit, 'projectId')
                hit_to_file = BaseESPublication.hit_to_file(hit)
                children.append(hit_to_file)
            except AttributeError:
                children.append(
                    BaseESPublicationLegacy(**hit.to_dict()).to_file())

        result = {
            'trail': [{
                'name': '$SEARCH',
                'path': '/$SEARCH'
            }],
            'name': '$SEARCH',
            'path': '/',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
        return result
Example #25
0
class LexEsSearch(object):
    '''Search class for pglex app.'''
    def __init__(self, query={}, project=None, index_ver=None, using=None):
        index = 'lex_{:}_{:}-lex'.format(project, index_ver)
        self.s = Search(using=using, index=index)
        self.project = project
        self.index_ver = index_ver
        self.query = query
        self.results = {}
        self.explain = ('explain' in list(query.keys())
                        and query['explain'] == 'true')
        self.includes_q = 'q' in list(query.keys())

    def build_search(self):
        if self.includes_q:
            self.add_q()
        self.add_filters()
        self.add_popularity()
        self.add_paging()
        self.add_sort()
        self.add_includes()
        self.add_explain()

    def add_q(self):
        '''String-based query with results returned by relevance.'''
        q = self.query['q']
        wildcarding = '*' in q or '?' in q
        add_contact_lg = not wildcarding
        my_search_fields = get_search_fields(self.project,
                                             self.index_ver,
                                             self.query,
                                             add_target_lang=True,
                                             add_contact_lang=add_contact_lg)
        myboosts = boosts
        sfields = []
        for searchfield in my_search_fields:
            try:
                sfield, boost = searchfield.split('^')
            except ValueError:  # No '^' in string
                sfield = searchfield
                try:
                    boost = myboosts[sfield]
                except KeyError:
                    boost = '1'
            if not wildcarding:
                sfield += '^' + boost
            else:
                sfield = sfield.replace('.', '__')
                sfield = {sfield: {'value': q.lower(), 'boost': boost}}
            sfields.append(sfield)
        if wildcarding is True:
            queries = []
            for sfield in sfields:
                queries.append(Q("wildcard", **sfield))
            boolquery = Q('bool', should=queries, minimum_should_match=1)
            self.s = self.s.query(boolquery)
        else:
            self.s = self.s.query('multi_match', query=q, fields=sfields)

    def add_filters(self):
        '''Filter queries that entries must match, e.g. part of speech.'''
        for filt in filter_fields:
            try:
                p = self.query[filt]
                # If filter came in as a query (url) param.
                #if type(p) == str and filt_pat.match(p):
                #    p = {filt: p}
                #    app.log.debug('jsonp: ' + p)
                if isinstance(p, list):
                    termtype = 'terms'
                #    pstr = ','.join(p)
                else:
                    termtype = 'term'
                #    pstr = p
                #app.log.debug('termtype: ' + termtype + '; filtp: ' + filt + ' -> ' + pstr)
                self.s = self.s.filter(termtype, **{filt: p})
            except KeyError:
                pass  # filter not included in query

    def add_popularity(self):
        '''Scale scores by a 'popularity' factor calculated from popcnt field
        or a random function. A random seed can be provided if a reproducible
        random sort is required. If paging through a randomized set of results,
        for example, then use the same seed when retrieving each page set.

        If no string search is in the query (i.e. only filters are used), then
        the _score values will be 0.0, in which case multiplying by the factor
        will have no effect, so the factor value replaces _score instead.
        This means documents will be scored based only on the popularity
        factor.'''
        if self.includes_q:
            mode = 'multiply'
        else:
            mode = 'replace'
        try:
            if self.query['pf'] == 'rand':
                try:
                    randarg = {'seed': self.query['seed'], 'field': '_seq_no'}
                except KeyError:
                    randarg = {}
                freqq = Q('function_score',
                          query=self.s.query,
                          random_score=randarg,
                          boost_mode=mode)
                self.s = self.s.query(freqq)
            elif self.query['pf'] != '0':
                freqq = Q('function_score',
                          query=self.s.query,
                          field_value_factor={
                              'field': 'popcnt',
                              'modifier': 'ln1p',
                              'factor': int(self.query['pf']),
                              'missing': 1
                          },
                          boost_mode=mode)
                self.s = self.s.query(freqq)
        except KeyError:
            pass

    def add_paging(self):
        '''Return a page of results. The default is the first 10 entries.'''
        try:
            size = int(self.query['size'])
        except KeyError:
            size = 10
        try:
            getfrom = int(self.query['from'])
        except KeyError:
            getfrom = 0
        self.s = self.s[getfrom:getfrom + size]

    def add_sort(self):
        sortfld = '_score'
        sortparams = {'order': 'desc'}
        try:
            keys = list(self.query.keys())
            assert ('sort' in keys or 'order' in keys or 'sortmode' in keys)
            if 'sort' in keys:
                sortfld = self.query['sort']
            if 'order' in keys:
                sortparams['order'] = self.query['order']
            else:
                if sortfld == '_score':
                    sortparams['order'] = 'desc'
                else:
                    sortparams['order'] = 'asc'
            if 'sortmode' in keys:
                sortparams['mode'] = self.query['sortmode']
        except AssertionError:
            pass
        self.s = self.s.sort({sortfld: sortparams})

    def add_includes(self):
        try:
            inc_fields = self.query['inc'].split(',')
            self.s = self.s.source({'include': inc_fields})
        except KeyError:
            self.s = self.s.source(source_fields)

    def add_explain(self):
        if self.explain is True:
            self.s = self.s.extra(explain=True)
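The slice in add_paging and an explicit `extra()` call are interchangeable ways of setting paging parameters. A small equivalence sketch (the index name is a placeholder; this should hold for recent elasticsearch-dsl releases):

from elasticsearch_dsl import Search

s = Search(index='lex_demo_1-lex')
# slicing and extra(from_=..., size=...) build the same request body
assert s[10:20].to_dict() == s.extra(from_=10, size=10).to_dict()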
Example #26
0
def dump_slice(slice_no):
    s = Search()
    s = s.extra(slice={"id": slice_no, "max": SLICES})
    for d in s.scan():
        print(d.meta.id)
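A sliced scroll like this is normally fanned out over several workers, one slice id each. A minimal sketch of driving dump_slice with a process pool, assuming SLICES is the same constant used above:

from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(SLICES) as pool:
        pool.map(dump_slice, range(SLICES))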
Example #27
0
def locations_generator(**kwargs):
    import datetime

    from geo.models import Area, District, Locality
    from mainapp.documents import DocumentLocation
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DOCUMENT_EVAL

    from elasticsearch_dsl import Search, Q

    criterion_tm_duos = kwargs['criterion_tm_duos']  # ((tm_1, criterion_id_1)....()...())

    for places in (Area, District, Locality):
        location_level = places.objects.first()._meta.verbose_name
        if places == Area:
            print('!!! Parsing Areas ...', datetime.datetime.now())
        elif places == District:
            print('!!! Parsing Districts ...', datetime.datetime.now())
        else:
            print('!!! Parsing Localities ...', datetime.datetime.now())
        for i, geo in enumerate(places.objects.all()):
            s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)\
                .source(['datetime', 'source', 'text', 'text_lemmatized', 'title', 'text_lemmatized_yandex'])
            q = Q(
                'bool',
                should=[Q("match_phrase", text_lemmatized=geo.name)] +
                       [Q("match_phrase", text=geo.name)] +
                       [Q("match_phrase", title=geo.name)] +
                       [Q("match_phrase", text_lemmatized_yandex=geo.name)],
                minimum_should_match=1,
            )
            s = s.query(q)
            s = s.extra(track_scores=True)
            print(f'!!! Scans count for {i} geo inside place: ', s.count(), datetime.datetime.now())
            scans = s.scan()

            for scan_obj in scans:

                document_datetime, document_source = hit_parser(scan_obj)

                doc = DocumentLocation(
                    document_es_id=scan_obj.meta.id,
                    document_datetime=document_datetime,
                    document_source=document_source,
                    location_name=geo.name,
                    location_level=location_level,
                    location_weight=scan_obj.meta.score,
                    location_id=geo.id,
                )

                for tm, criterion_id in criterion_tm_duos:
                    ev_docs = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion_id}") \
                        .filter("term", document_es_id=scan_obj.meta.id) \
                        .source(['value', 'document_datetime', 'document_source']) \
                        .execute()

                    if not ev_docs:
                        continue

                    ev_docs = ev_docs[0]

                    value = ev_docs.value if hasattr(ev_docs, "value") and ev_docs.value else None

                    doc[f'criterion_{tm}_{criterion_id}'] = value

                yield doc
Example #28
0
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = []
        for _license in search_params.data[license_field].split(','):
            license_filters.append(Q('term', license__keyword=_license))
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        s = s.query('constant_score',
                    filter=Q(
                        'query_string',
                        query=search_params.data['q'],
                        fields=['tags.name', 'title'],
                    ))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 query=creator,
                                 default_field='creator'))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 query=title,
                                 default_field='title'))
        if 'tags' in search_params.data:
            tags = search_params.data['tags']
            s = s.query('constant_score',
                        filter=Q('query_string',
                                 default_field='tags.name',
                                 query=tags))

    s = s.extra(track_scores=True)
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
Example #29
0
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)
    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
Example #30
0
    def apply_paging(self,
                     catalog: CatalogName,
                     es_search: Search,
                     pagination: Pagination,
                     peek_ahead: bool = True) -> Search:
        """
        Set sorting and paging parameters for the given ES search request.

        :param catalog: The name of the catalog to search in

        :param es_search: The Elasticsearch request object

        :param pagination: The sorting and paging settings to apply

        :param peek_ahead: If True, request one more hit so that
                           _generate_paging_dict can know if there is another
                           page. Use this to prevent a last page that's empty.
        """
        sort_field = pagination.sort + '.keyword'
        sort_order = pagination.order

        field_type = self.field_type(catalog,
                                     tuple(pagination.sort.split('.')))
        sort_mode = field_type.es_sort_mode

        def sort(order):
            assert order in ('asc', 'desc'), order
            return (
                {
                    sort_field: {
                        'order': order,
                        'mode': sort_mode,
                        'missing': '_last' if order == 'asc' else '_first',
                        **({} if field_type.es_type is None else
                           {'unmapped_type': field_type.es_type})
                    }
                },
                # This secondary sort field serves as the tie breaker for when
                # the primary sort field is not unique across documents.
                # Otherwise it's redundant, especially if it's the same as the
                # primary sort field. However, always having a secondary sort
                # simplifies the code, and most real-world use cases use sort
                # fields that are not unique.
                {
                    'entity_id.keyword': {
                        'order': order
                    }
                })

        # Using search_after/search_before pagination
        if pagination.search_after is not None:
            es_search = es_search.extra(search_after=pagination.search_after)
            es_search = es_search.sort(*sort(sort_order))
        elif pagination.search_before is not None:
            es_search = es_search.extra(search_after=pagination.search_before)
            rev_order = 'asc' if sort_order == 'desc' else 'desc'
            es_search = es_search.sort(*sort(rev_order))
        else:
            es_search = es_search.sort(*sort(sort_order))

        # FIXME: Remove this or change to 10000 (the default)
        #        https://github.com/DataBiosphere/azul/issues/3770
        es_search = es_search.extra(track_total_hits=True)

        assert isinstance(peek_ahead, bool), type(peek_ahead)
        # fetch one more than needed to see if there's a "next page".
        es_search = es_search.extra(size=pagination.size + peek_ahead)
        return es_search
Example #31
0
def make_query(q, lon=None, lat=None, match_all=True, limit=15, filters=None):
    if filters is None:
        filters = {}
    s = Search(es).index(INDEX)
    should_match = '100%' if match_all else '2<-1 6<-2 8<-3 10<-50%'
    match = Q(
        'bool',
        must=[Q('match', collector={
            'fuzziness': 1,
            'prefix_length': 2,
            'query': q,
            'minimum_should_match': should_match,
            'analyzer': 'search_stringanalyzer'
        })],
        should=[
            Q('match', **{'name.keywords': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'street.keywords': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'city.default': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'way_label': {
                'query': q,
                'boost': 2,
                'analyzer': 'search_stringanalyzer'
            }}),
            Q('match', **{'housenumber': {
                'query': q,
                'boost': 2,
                'analyzer': 'housenumber_analyzer'
            }}),
        ]
    )

    functions = [{
        "script_score": {
            "script": "1 + doc['importance'].value * 40",
            "lang": "groovy"
        }
    }]
    if lon is not None and lat is not None:
        functions.append({
            "script_score": {
                "script": "dist = doc['coordinate'].distanceInKm(lat, lon); 1 / (0.5 - 0.5 * exp(-5*dist/maxDist))",
                "lang": "groovy",
                "params": {
                    "lon": lon,
                    "lat": lat,
                    "maxDist": 100
                }
            }
        })

    fscore = Q(
        'function_score',
        score_mode="multiply",
        boost_mode="multiply",
        query=match,
        functions=functions
    )

    s = s.query(fscore)
    # Only filter out 'house' if we are not explicitly asking for this
    # type.
    if filters.get('type') != 'housenumber':
        # We don't want results with an ordinal (bis, ter…) if the ordinal
        # field itself doesn't match
        filter_ordinal = F('or', [
            F('missing', field="ordinal"),
            F({"query": {"match": {"ordinal": {"query": q, "analyzer": "housenumber_analyzer"}}}}),
        ])
        house_query = Filtered(query=Match(housenumber={"query": q, "analyzer": "housenumber_analyzer"}), filter=filter_ordinal)
        filter_house = F('or', [
            F('missing', field="housenumber"),
            F('exists', field="name.keywords"),
            F({'query': house_query.to_dict()}),
        ])
        s = s.filter(filter_house)
    if filters:
        # We are not using real filters here, because filters are not analyzed,
        # so for example "city=Chauny" will not match, because "chauny" is in
        # the index instead.
        for k, v in filters.items():
            s = s.query({'match': {k: v}})
    return s.extra(size=limit)
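The `F` filter helper used above exists only in elasticsearch-dsl 1.x. On 2.x and later, the same "missing ordinal or matching ordinal" logic would be built from `Q` objects instead, roughly (assuming the same `Q` import and the query string `q` from make_query):

filter_ordinal = Q('bool', should=[
    Q('bool', must_not=[Q('exists', field='ordinal')]),
    Q('match', ordinal={'query': q, 'analyzer': 'housenumber_analyzer'}),
], minimum_should_match=1)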
Example #32
0
    def get_context_data(self, **kwargs):
        context = super(RecipeDetailView, self).get_context_data(**kwargs)
        recipe = self.get_queryset().all()[0]
        # increment count
        if not self.request.session.get('recipe_viewed_%s' % recipe.pk, None):
            recipe.increment_views()
            self.request.session['recipe_viewed_%s' % recipe.pk] = 1
        logger.error('recipe views %s', recipe.views)
        logger.error('recipe session %s', self.request.session.keys())
        course_info = recipe.courses.all()
        holiday_info = recipe.holidays.all()


        context['title'] = context['recipe'].name

        s = Search(using=es, index='recipe')

        exclude_clause = []
        match_clause = []

        exclude_clause.append(
            {"term": {"document_id": recipe.id}}
        )
        if course_info:
            course_id = course_info[0].id
            match_clause.append({'match': {'courses': {'query': course_id, 'boost': 5}}})
        if holiday_info:
            holiday_id = holiday_info[0].id
            match_clause.append({'match': {'holidays': holiday_id}})

        match_clause.append({'match': {'name': {'query': recipe.name, 'boost': 2}}})

        s = Search(using=es, index='recipe')
        s.update_from_dict({
            'query': {
                'function_score': {
                    'query': {
                        'bool': {
                            "must_not": exclude_clause,
                            'should': match_clause
                        }
                    },
                    'random_score': {
                        'seed': 12371203
                    }
                }
            }
        })

        s = s.extra(size=6)
        results = s.execute()
        context['suggested_recipes'] = results

        if self.request.user.is_authenticated():
            user_collection = UserCollection.objects.filter(user=self.request.user)
            recipe_collection = RecipesCollection.objects\
                .filter(recipe_id=context['recipe'].id, collection__user=self.request.user)\
                .only('collection__id')
            user_recipe_collection = set(i.collection_id for i in recipe_collection.all())
            initial = {
                'recipes': self.kwargs.get("pk"),
                'recipe_collection': user_collection,
                'user_recipe_collection': user_recipe_collection
            }
            context['form'] = UserRecipeCollectionForm(
                initial=initial
            )
        context['searchform'] = SearchKeywordForm()
        context['current_recipe_name'] = recipe.name
        return context
Example #33
0
    def executeTermQuery(self, t, user_terminology, query_type):
        size = self.query_size_full
        if (query_type == "fullmatch"):
            q1 = Q({
                "multi_match": {
                    "query": t,
                    "fuzziness": 0,
                    "fields": [
                        "name.fullmatch_exact^" + self.field_boost,
                        "name.fullmatch_folding"
                    ]
                }
            })
        elif (query_type == "fuzzy_fullmatch"):
            q_a = Q({
                "multi_match": {
                    "query": t,
                    "fuzziness": 1,
                    "prefix_length": self.prefix_length,
                    "fields": [
                        "name.fullmatch_exact^" + self.field_boost,
                        "name.fullmatch_folding"
                    ]
                }
            })
            q_b = Q({
                "multi_match": {
                    "query": t,
                    "fuzziness": "AUTO",
                    "prefix_length": self.prefix_length,
                    "fields": [
                        "name.fullmatch_exact^" + self.field_boost,
                        "name.fullmatch_folding"
                    ]
                }
            })
            q1 = Q('bool', should=[q_a, q_b])
        else:
            size = self.query_size_shingle
            q1 = Q({
                "multi_match": {
                    "query": t,
                    "fuzziness": 0,
                    "fields": [
                        "name.shinglematch_exact^" + self.field_boost,
                        "name.shinglematch_folding"
                    ]
                }
            })

        qFilter = Q('terms',
                    terminology_id=list(self.terminologies_dict.keys()))
        should_clause = []
        if user_terminology is not None:
            # limit results to terminologies related to specific domain(s)
            qShould1 = Q('constant_score',
                         filter=Q('terms', terminology_id=user_terminology),
                         boost=20)
            q = Q('bool', must=[q1], should=[qShould1], filter=[qFilter])
        else:
            for k, v in self.terminologies_dict.items():
                should_clause.append(
                    Q('constant_score',
                      filter=Q('term', terminology_id=k),
                      boost=v))
            #qShould_q = Q('constant_score', filter=Q('term', terminology_id=13), boost=self.quantity_terminology_boost) #added 21-02-2020 boost by quantity
            #qShould1 = Q('constant_score', filter=Q('terms',terminology_id=self.primary_terminology), boost=self.primary_terminology_boost)
            #qShould2 = Q('constant_score', filter=Q('terms', terminology_id=self.secondary_terminologies), boost=self.second_terminology_boost)
            q = Q('bool', must=[q1], should=should_clause, filter=[qFilter])
        #print(q.to_dict())
        s = Search(using=self.elasticSearchInst,
                   index=self.elastic_index,
                   doc_type=self.elastic_doctype).query(q)
        s = s.extra(size=size)

        response = s.execute()
        list_res = []
        return_val = []
        if response:
            response = response.to_dict()
            #print("%d documents found" % response ['hits']['total'])
            for hit in response['hits']['hits']:
                dictres = {
                    "id": int(hit['_id']),
                    "name": hit['_source']['name'],
                    "abbreviation": hit['_source']['abbreviation'],
                    "score": hit['_score'],
                    "terminology": hit['_source']['terminology']
                }
                if 'description_uri' in hit['_source']:
                    dictres['description_uri'] = hit['_source'][
                        'description_uri']
                if 'topics' in hit['_source']:
                    dictres['topics'] = hit['_source']['topics']
                list_res.append(dictres)

            if list_res:
                if query_type == "shinglematch":
                    #2020-03-05 do not apply max score filter for shingle match
                    fragment_vector = self.tokenize_string(
                        t
                    )  #Counter({'temperature': 1, 'sea': 1, 'surface': 1})
                    #print('fragment_vector ',fragment_vector)
                    list_ids = [str(d['id']) for d in list_res]
                    tokenized_terms_dict = self.tokenize_by_ids(list_ids)
                    #print(tokenized_terms_dict)
                    list_ids_tuples = self.generateCombinationsByTermIds(
                        list_ids, len(t.split()))
                    final_ids = self.compute_cosine_sim(
                        tokenized_terms_dict, list_ids_tuples, fragment_vector)
                    #remove the records not in final_ids
                    return_val = [d for d in list_res if d['id'] in final_ids]
                else:
                    #return_val = [d for d in list_res if d['score'] == max_score]
                    #27-02-2020 for full and fuzzy match return term with max score (for duplicate terms only)
                    list_names = [d['name']
                                  for d in list_res]  # don't change to a set
                    duplicates = {
                        item
                        for item, count in Counter(list_names).items()
                        if count > 1
                    }
                    remove_ids = []
                    for dup in duplicates:
                        mx = max(
                            {d['score']
                             for d in list_res if d['name'] == dup})
                        remove_ids.extend({
                            d['id']
                            for d in list_res
                            if d['name'] == dup and d['score'] < mx
                        })
                    return_val = [
                        d for d in list_res if d['id'] not in remove_ids
                    ]
        return return_val
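A short usage sketch for executeTermQuery above. The surrounding class and its configuration attributes (query_size_full, query_size_shingle, field_boost, prefix_length, terminologies_dict, elasticSearchInst, ...) are assumed to be initialised elsewhere; `matcher`, the search terms and the terminology IDs below are purely illustrative.

# exact matching against the fullmatch sub-fields
matches = matcher.executeTermQuery("sea surface temperature",
                                   user_terminology=None,
                                   query_type="fullmatch")
for m in matches:
    print(m["id"], m["name"], m["score"], m["terminology"])

# fuzzy matching (fuzziness 1 and AUTO combined in a bool/should)
fuzzy = matcher.executeTermQuery("sea surfce temperature",
                                 user_terminology=[3, 7],
                                 query_type="fuzzy_fullmatch")

# shingle matching: candidates are post-filtered by cosine similarity
# between the fragment tokens and the tokenised candidate terms
fragments = matcher.executeTermQuery("surface temperature",
                                     user_terminology=None,
                                     query_type="shinglematch")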
Example #34
0
def build_search_object(q=None,
                        filters=None,
                        after=None,
                        offset=None,
                        limit=None,
                        fields=None,
                        facets=None,
                        facet_limits=None,
                        sort=None,
                        **kwargs):
    '''
    Given the parameters, creates a new elasticsearch-dsl Search object and returns it.

    :param q: a query string which will be searched against the meta.all field or a dict of fields
              and search values. If this is a dict then the keys (field names) are always prefixed
              with "data." unless the key is an empty string in which case the field uses is
              meta.all. This allows combination searches across meta.all and data.* fields.
    :param filters: a dict of fields and values to filter the result with. If a key is present that
                    is equal to "__geo__" then the value associated with it should be a dict which
                    will be treated as a geo query to be run against the `meta.geo` field. The value
                    should contain a "type" key which must have a corresponding value of "point",
                    "box" or "polygon" and then other keys that are dependant on the type:
                        - point:
                            - distance: the radius of the circle centred on the specified location
                                        within which records must lie to be matched. This can be
                                        specified in any form that elasticsearch accepts for
                                        distances (see their docs, e.g. values like 10km).
                            - point: the point to centre the radius on, specified as a lat, long
                                     pair in a list (i.e. [-20, 40.2]).
                        - box:
                            - points: the top left and bottom right points of the box, specified as
                                      a list of two lat/long pairs (i.e. [[-20, 40.2], [0.5, 100]]).
                        - polygon:
                            - points: a list of at least 3 lat/long pairs (i.e. [[-16, 44],
                                      [-13.1, 34.8], [15.99, 35], [5, 49]]).
    :param after: the search after value to start the search result from (for pagination). Cannot be
                  used in conjunction with offset. If both offset and after are provided then after
                  is used and offset is ignored.
    :param offset: the offset to start the search result from (for pagination)
    :param limit: the limit to stop the search result at (for pagination)
    :param fields: a list of field names to return in the result
    :param facets: a list of field names to return an aggregation of top 10 values and counts for
    :param facet_limits: a dict of fields and their customised top n limits
    :param sort: a list of fields to sort by with ordering. By default the fields are sorted
                 ascending, but by providing "desc" after the field name a descending sort will be
                 used. An ascending sort on _id is always added unless included in this list. This
                 is to ensure there is a unique tie-breaking field which is useful for ensuring
                 results stay the same each time they are requested and necessary to ensure the
                 correct result list responses when using the after parameter for pagination.
    :param kwargs: as a convenience we allow a kwargs parameter which we ignore, this is useful
                   as it allows the arguments to be passed by just unpacking the data_dict
    :return: an elasticsearch-dsl Search object
    '''
    search = Search()
    # add a free text query across all fields if there is one. This searches against meta.all which
    # is a copy field created by adding the values of each data.* field
    if q is not None and q != u'' and q != {}:
        if isinstance(q, (str, unicode, int, float)):
            search = search.query(
                u'match', **{u'meta.all': {
                    u'query': q,
                    u'operator': u'and'
                }})
        elif isinstance(q, dict):
            for field, query in sorted(q.items(), key=operator.itemgetter(0)):
                # TODO: change this to __all__ to match __geo__?
                if field == u'':
                    field = u'meta.all'
                else:
                    field = prefix_field(field)
                search = search.query(
                    u'match',
                    **{field: {
                        u'query': query,
                        u'operator': u'and'
                    }})
    if filters is not None:
        for field, values in sorted(filters.items(),
                                    key=operator.itemgetter(0)):
            if not isinstance(values, list):
                values = [values]
            if field == u'__geo__':
                # only pass through the first value
                search = add_geo_search(search, values[0])
            else:
                field = u'{}'.format(prefix_field(field))
                for value in values:
                    # filter on the keyword version of the field
                    search = search.filter(u'term', **{field: value})

    # after and offset cannot be used together, prefer after over offset
    if after is not None:
        search = search.extra(search_after=after)
    elif offset is not None:
        search = search.extra(from_=int(offset))
    # add the limit or a default of 100 if there isn't one specified
    search = search.extra(size=int(limit) if limit is not None else 100)

    if fields is not None:
        search = search.source(map(prefix_field, fields))
    if facets is not None:
        facet_limits = facet_limits if facet_limits is not None else {}
        for facet in facets:
            # to produce the facet counts we use a bucket terms aggregation, note that using the
            # bucket function on the top level aggs attribute on the search object doesn't return a
            # copy of the search object like it does when adding queries etc
            search.aggs.bucket(facet,
                               u'terms',
                               field=prefix_field(facet),
                               size=facet_limits.get(facet, 10))

    # at least one sort is always added, on the _id column. This is necessary to ensure that use of
    # search_after is predictable (the elasticsearch docs recommend that a tie-breaker field
    # is present, otherwise the response can include duplicates/missing records). The _id field is
    # always unique and therefore an ideal tie-breaker, so we make sure it's always in the sort
    sorts = []
    # if the caller passes in _id then we don't need to add it in again
    id_in_sort = False
    if sort is not None:
        for field_and_sort in sort:
            if not field_and_sort.endswith(
                    u' desc') and not field_and_sort.endswith(u' asc'):
                # default the sort direction to ascending if nothing is provided
                field_and_sort += u' asc'
            field, direction = field_and_sort.rsplit(u' ', 1)
            # set the id_in_sort boolean to True if we see the _id field in the caller defined sort
            id_in_sort = id_in_sort or field == u'_id'
            field = prefix_field(field)
            # if the sort direction is desc we need to add a minus sign in front of the field name,
            # otherwise we can just use the field name on its own as the default sort is asc
            sorts.append(u'-{}'.format(field) if direction ==
                         u'desc' else field)

    # by default, sort by the _id field
    if not id_in_sort:
        sorts.append(prefix_field(u'_id'))
    search = search.sort(*sorts)

    return search
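A brief usage sketch for build_search_object, based only on the parameters described in the docstring above; the field names, the geo point, the index name and the `es_client` connection are illustrative assumptions.

search = build_search_object(
    q={u'': u'moth', u'genus': u'Sphinx'},      # meta.all plus a data.* field
    filters={
        u'country': u'United Kingdom',
        u'__geo__': {
            u'type': u'point',
            u'distance': u'10km',
            u'point': [-20, 40.2],
        },
    },
    limit=50,
    fields=[u'genus', u'country'],
    facets=[u'country'],
    facet_limits={u'country': 5},
    sort=[u'year desc'],
)
# the returned Search has no index or connection bound yet
response = search.index(u'records-index').using(es_client).execute()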
Example #35
0
def do_fulltext_search(query: FulltextQuery,
                       deep_page_limit: int = 2000) -> FulltextHits:

    search = Search(using=es_client,
                    index=settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX)

    if query.collapse_key:
        search = search.filter("term", collapse_key=query.collapse_key)
    else:
        search = search.extra(
            collapse={
                "field": "collapse_key",
                "inner_hits": {
                    "name": "more_pages",
                    "size": 0,
                },
            })

    # apply filters from query
    search = apply_filters(search, query)

    # we combine several queries to improve scoring.

    # this query uses the fancy built-in query string parser
    basic_fulltext = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        quote_field_suffix=".exact",
        fields=[
            "title^4",
            "biblio_all^3",
            "everything",
        ],
    )
    has_fulltext = Q("terms",
                     **{"access_type": ["ia_sim", "ia_file", "wayback"]})
    poor_metadata = Q(
        "bool",
        should=[
            # if these fields aren't set, metadata is poor. The more that do
            # not exist, the stronger the signal.
            Q("bool", must_not=Q("exists", field="year")),
            Q("bool", must_not=Q("exists", field="type")),
            Q("bool", must_not=Q("exists", field="stage")),
            Q("bool", must_not=Q("exists", field="biblio.container_name")),
        ],
    )

    if query.filter_availability == "fulltext" or query.filter_availability is None:
        base_query = basic_fulltext
    else:
        base_query = Q("bool", must=basic_fulltext, should=[has_fulltext])

    if query.q == "*":
        search = search.query("match_all")
        search = search.sort("_doc")
    else:
        search = search.query(
            "boosting",
            positive=base_query,
            negative=poor_metadata,
            negative_boost=0.5,
        )

    # simplified version of basic_fulltext query, for highlighting
    highlight_query = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        lenient=True,
    )
    search = search.highlight(
        "abstracts.body",
        "fulltext.body",
        "fulltext.acknowledgement",
        "fulltext.annex",
        highlight_query=highlight_query.to_dict(),
        require_field_match=False,
        number_of_fragments=2,
        fragment_size=200,
        order="score",
        # TODO: this will fix highlight encoding, but requires ES 7.x
        # encoder="html",
    )

    # sort order
    if query.sort_order == "time_asc":
        search = search.sort("year", "date")
    elif query.sort_order == "time_desc":
        search = search.sort("-year", "-date")
    elif query.sort_order == "relevancy" or query.sort_order is None:
        pass
    else:
        raise ValueError(
            f"Unknown 'sort_order' parameter value: '{query.sort_order}'")

    # Sanity checks
    limit = min((int(query.limit or 15), 100))
    offset = max((int(query.offset or 0), 0))
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    search = search.params(track_total_hits=True)
    search = search[offset:(offset + limit)]

    query_start = datetime.datetime.now()
    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e_raw:
        # this is a "user" error
        e: Any = e_raw
        logging.warn("elasticsearch 400: " + str(e.info))
        if e.info.get("error", {}).get("root_cause", {}):
            raise ValueError(
                str(e.info["error"]["root_cause"][0].get("reason"))) from e
        else:
            raise ValueError(str(e.info)) from e
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        logging.warn("elasticsearch non-200 status code: {}".format(e.info))
        raise IOError(str(e.info)) from e
    query_delta = datetime.datetime.now() - query_start

    # convert from API objects to dicts
    results = transform_es_results(resp)

    count_found: int = 0
    if isinstance(resp.hits.total, int):
        count_found = int(resp.hits.total)
    else:
        count_found = int(resp.hits.total["value"])
    count_returned = len(results)

    # if we grouped to less than a page of hits, update returned count
    if (not query.collapse_key) and offset == 0 and (count_returned < limit):
        count_found = count_returned

    return FulltextHits(
        query_type="fulltext",
        count_returned=count_returned,
        count_found=count_found,
        offset=offset,
        limit=limit,
        deep_page_limit=deep_page_limit,
        query_time_ms=int(resp.took),
        query_wall_time_ms=int(query_delta.total_seconds() * 1000),
        results=results,
    )
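A minimal, standalone sketch of the "boosting" pattern used above: documents matching the negative clause still match, but their score is multiplied by negative_boost (0.5 here), which is how records with poor metadata get demoted rather than excluded. The query text is illustrative; es_client and settings are assumed to be the same objects used by do_fulltext_search.

from elasticsearch_dsl import Q, Search

positive = Q("query_string", query="coral reef bleaching",
             default_operator="AND",
             fields=["title^4", "biblio_all^3", "everything"])
# one missing-metadata signal, abbreviated from the poor_metadata query above
negative = Q("bool", should=[Q("bool", must_not=Q("exists", field="year"))])

s = (Search(using=es_client, index=settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX)
     .query("boosting", positive=positive, negative=negative, negative_boost=0.5)
     .params(track_total_hits=True)[0:15])
resp = s.execute()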
Example #36
0
class ElasticsearchSearch(ABC):
    """Representa una búsqueda a realizar utilizando Elasticsearch. Dependiendo
    de los parámetros de búsqueda, se puede llegar a necesitar más de una
    consulta a Elasticsearch para completar la misma.

    Attributes:
        _search (elasticsearch_dsl.Search): Búsqueda principal a envíar a
            Elasticsearch.
        _index (str): Índice sobre el cual realizar la búsqueda principal.
        _offset (int): Cantidad de resultados a saltear ('from').
        _result (ElasticsearchResult): Resultado de la búsqueda.

    """

    __slots__ = ['_search', '_index', '_offset', '_result']

    def __init__(self, index, query):
        """Inicializa un objeto de tipo ElasticsearchSearch.

        Args:
            index (str): Ver atributo '_index'.
            query (dict): Parámetros de la búsqueda. Ver el método
                '_read_query' para tomar nota de los valores permitidos
                dentro del diccionario.

        """
        self._search = Search(index=index)
        if constants.ES_TRACK_TOTAL_HITS:
            # Configure the maximum number of hits that can be used to
            # compute an accurate total hit count (new in Elasticsearch 7.0.0).
            self._search = self._search.extra(
                track_total_hits=constants.ES_TRACK_TOTAL_HITS)

        self._index = index
        self._offset = query.get('offset', 0)
        self._result = None

        self._read_query(**query)

    @abstractmethod
    def search_steps(self):
        """Devuelve un iterador de búsquedas elasticsearch_dsl.Search, cada una
        representando un paso requerido para completar la búsqueda
        ElasticsearchSearch.

        Cuando el iterador finaliza, el valor de 'self._result' contiene el
        resultado final de la búsqueda.

        Yields:
            elasticsearch_dsl.Search: Búsqueda DSL que se desea ejecutar. Sus
                resultados deberían ser devueltos por el invocador de
                'next()/send()'.

        """
        raise NotImplementedError()

    def _read_query(self,
                    fields=None,
                    size=constants.DEFAULT_SEARCH_SIZE,
                    offset=0):
        """Lee los parámetros de búsqueda recibidos y los agrega al atributo
        'self._search'.

        Args:
            fields (list): Lista de campos a incluir en los resultados de la
                búsqueda.
            size (int): Tamaño máximo de resultados a devolver.
            offset (int): Cantidad de resultados a saltear.

        """
        if fields:
            self._search = self._search.source(includes=fields)

        self._search = self._search[offset:offset + size]

    def _expand_intersection_query(self, geo_shape_ids):
        """Expande (comprueba) que los IDs contenidos en geo_shape_ids sean
        válidos y referencien a entidades existentes. Los IDs inválidos son
        removidos.

        Este paso es necesario ya que la búsqueda por geometrías pre-indexadas
        de Elasticsearch no acepta IDs de documentos no existentes. Si se
        intenta utilizar un ID inválido, retorna HTTP 400.

        Para realizar la búsqueda, se retorna un iterador de
        elasticsearch_dsl.Search. De esta forma, se puede utilizar este método
        desde 'search_steps', agregando instancias de elasticsearch_dsl.Search
        que deben ser ejecutadas para completar los resultados de la instancia
        de ElasticsearchSearch.

        Yields:
            elasticsearch_dsl.Search: Búsqueda DSL necesaria para completar el
                chequeo de IDs.

        Args:
            geo_shape_ids (dict): Diccionario de str - list, las keys siendo
                tipos de entidades, y los valores siendo listas de IDs para el
                tipo de entidad.

        """
        checked_ids = {}

        for entity_type in INTERSECTION_PARAM_TYPES:
            if entity_type not in geo_shape_ids:
                continue

            entity_ids = list(geo_shape_ids[entity_type])
            search_class = entity_search_class(entity_type)
            search = search_class({
                'ids': entity_ids,
                'size': len(entity_ids),
                'fields': [N.ID]
            })

            yield from search.search_steps()

            checked_ids[entity_type] = [
                hit[N.ID] for hit in search.result.hits
            ]

        self._search = self._search.query(
            _build_geo_query(N.GEOM, ids=checked_ids))

    def _expand_geometry_query(self, search_class):
        """Expande (completa) una búsqueda que incluye 'geometria' en sus
        campos. Para lograr esto, crea búsquedas elasticsearch_dsl.Search
        a los índices correspondientes que incluyen geometrías.

        Este método es necesario ya que los índices de entidades no cuentan
        con las versiones originales de las geometrías, por razones de
        performance (ver comentario en archivo es_config.py). Entonces, es
        necesario buscar las geometrías en índices separados, utilizando los
        IDs de los resultados encontrados en la búsqueda principal
        ('self._search').

        Para realizar la búsqueda de geometrías, se retorna un iterador de
        elasticsearch_dsl.Search. De esta forma, se puede utilizar este método
        desde 'search_steps', agregando instancias de elasticsearch_dsl.Search
        que deben ser ejecutadas para completar los resultados de la instancia
        de ElasticsearchSearch.

        Args:
            search_class (type): Clase a utilizar para crear el iterador de
                búsquedas.

        Yields:
            elasticsearch_dsl.Search: Búsqueda DSL necesaria para obtener las
                geometrías.

        """
        ids = [hit['id'] for hit in self._result.hits]

        geom_search = search_class({
            'ids': ids,
            'fields': [N.ID, N.GEOM],
            'size': len(ids)
        })

        yield from geom_search.search_steps()

        original_hits = {hit[N.ID]: hit for hit in self._result.hits}

        for hit in geom_search.result.hits:
            # Add the geometry field to the original results
            original_hits[hit[N.ID]][N.GEOM] = hit[N.GEOM]

    @property
    def result(self):
        """Devuelve el resultado de la búsqueda, si esta fue ejecutada.

        Raises:
            RuntimeError: Si la búsqueda no fue ejecutada.

        Returns:
            ElasticsearchResult: Resultado de la búsqueda.

        """
        if self._result is None:
            raise RuntimeError('Search has not been executed yet')

        return self._result

    @staticmethod
    def run_searches(es, searches):
        """Ejecuta una lista de búsquedas ElasticsearchSearch.

        Para ejecutar las búsquedas, se obtiene un iterador de búsquedas
        elasticsearch_dsl.Search por cada elemento de 'searches'. Utilizando
        los iteradores, se construyen listas de elasticsearch_dsl.Search, que
        son luego ejecutadas utilizando '_run_multisearch'. Después, los
        resultados son devueltos a cada iterador, que pueden o no generar una
        nueva búsqueda elasticsearch_dsl.Search. El proceso se repite hasta que
        todos los iteradores hayan finalizado. Con todo este proceso se logra:

            1) Ejecutar cualquier tipo de búsquedas bajo una mismas interfaz.
            2) Ejecutar búsquedas que requieren distintas cantides de pasos
               bajo una misma interfaz.
            3) Utilizar la funcionalidad de MultiSearch para hacer la menor
               cantidad de consultas posible a Elasticsearch.

        Los resultados de cada búsqueda pueden ser accedidos vía el campo
        '.result' de cada una.

        Args:
            es (Elasticsearch): Conexión a Elasticsearch.
            searches (list): Lista de búsquedas ElasticsearchSearch o
                derivados. La lista puede ser de cualquier largo ya que sus
                contenidos son fraccionados por '_run_multisearch' para evitar
                consultas demasiado extensas a Elasticsearch.

        """
        iterators = [search.search_steps() for search in searches]
        iteration_data = []
        for iterator in iterators:
            search = utils.step_iterator(iterator)

            if search:
                iteration_data.append((iterator, search))

        while iteration_data:
            responses = _run_multisearch(
                es, [search for _, search in iteration_data])

            iterators = (iterator for iterator, _ in iteration_data)
            iteration_data = []

            for iterator, response in zip(iterators, responses):
                search = utils.step_iterator(iterator, response)
                if search:
                    iteration_data.append((iterator, search))
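To make the generator protocol concrete, here is a hypothetical single-step subclass: each Search yielded from search_steps receives its response via send(), and '_result' must be set before the iterator finishes. The index name 'calles', the ElasticsearchResult constructor signature and the `es` connection are assumptions, not part of the original code.

class StreetsSearch(ElasticsearchSearch):
    """Hypothetical one-step search against a 'calles' index."""

    def __init__(self, query):
        super().__init__('calles', query)

    def search_steps(self):
        # run_searches sends the response of the yielded Search back in
        response = yield self._search
        self._result = ElasticsearchResult(response, self._offset)  # assumed signature

searches = [StreetsSearch({'size': 10, 'offset': 0})]
ElasticsearchSearch.run_searches(es, searches)
hits = searches[0].result.hits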
Example #37
0
class Query():
    """
    Base query class used to query elasticsearch
    """

    filters = {}
    start_date = None
    end_date = None
    interval_ = "month"
    offset_ = None

    def __init__(self, index, esfilters={}, interval=None, offset=None):
        """
        :param index: An Index object containing the connection details
        :param esfilters: TODO: this is still to be implemented
        :param interval: interval to use for timeseries data
        :param offset: TODO: this is still to be implemented
        """
        self.index = index
        self.search = Search(using=self.index.es, index=self.index.index_name)

        self.parent_agg_counter = 0
        # copy the class-level defaults so filters are not shared across instances
        self.filters = dict(self.filters)
        if esfilters:
            self.filters.update(esfilters)
        # an ordered aggregation dict so that the nested aggregations can be made chainable
        self.aggregations = OrderedDict()
        self.child_agg_counter_dict = defaultdict(
            int)  # to keep a track of nested child aggregations
        self.size = 10000  # temporary hack to get all the data
        self.precision_threshold = 3000  # accuracy that we want when counting the number of items
        if interval:
            self.interval_ = interval
        if offset:
            self.offset_ = offset

    def add_query(self, key_val={}):
        """
        Add an es_dsl query object to the es_dsl Search object

        :param key_val: a key-value pair(dict) containing the query to be added to the search object
        :returns: self, which allows the method to be chainable with the other methods
        """

        q = Q("match", **key_val)
        self.search = self.search.query(q)
        return self

    def add_inverse_query(self, key_val={}):
        """
        Add an es_dsl inverse query object to the es_dsl Search object

        :param key_val: a key-value pair(dict) containing the query to be added to the search object
        :returns: self, which allows the method to be chainable with the other methods
        """

        q = Q("match", **key_val)
        self.search = self.search.query(~q)
        return self

    def is_open(self):
        """
        Add the {'state':'open'} query to the Search object

        :returns: self, which allows the method to be chainable with the other methods
        """

        self.add_query({"state": "open"})
        return self

    def is_closed(self):
        """
        Add the {'state':'closed'} query to the Search object

        :returns: self, which allows the method to be chainable with the other methods
        """

        self.add_query({"state": "closed"})
        return self

    def get_sum(self, field=None):
        """
        Create a sum aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("sum", field=field)
        self.aggregations['sum_' + field] = agg
        return self

    def get_average(self, field=None):
        """
        Create an avg aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("avg", field=field)
        self.aggregations['avg_' + field] = agg
        return self

    def get_percentiles(self, field=None, percents=None):
        """
        Create a percentile aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :param percents: the specific percentiles to be calculated
                         default: [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        if not percents:
            percents = [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
        agg = A("percentiles", field=field, percents=percents)

        self.aggregations['percentiles_' + field] = agg
        return self

    def get_terms(self, field=None):
        """
        Create a terms aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("terms", field=field, size=self.size, order={"_count": "desc"})
        self.aggregations['terms_' + field] = agg
        return self

    def get_min(self, field=None):
        """
        Create a min aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("min", field=field)
        self.aggregations['min_' + field] = agg
        return self

    def get_max(self, field=None):
        """
        Create a max aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("max", field=field)
        self.aggregations['max_' + field] = agg
        return self

    def get_cardinality(self, field=None):
        """
        Create a cardinality aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("cardinality",
                field=field,
                precision_threshold=self.precision_threshold)
        self.aggregations['cardinality_' + field] = agg
        return self

    def get_extended_stats(self, field=None):
        """
        Create an extended_stats aggregation object and add it to the aggregation dict

        :param field: the field present in the index that is to be aggregated
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            raise AttributeError(
                "Please provide field to apply aggregation to!")
        agg = A("extended_stats", field=field)
        self.aggregations['extended_stats_' + field] = agg
        return self

    def add_custom_aggregation(self, agg, name=None):
        """
        Takes in an es_dsl Aggregation object and adds it to the aggregation dict.
        Can be used to add custom aggregations such as moving averages

        :param agg: aggregation to be added to the es_dsl search object
        :param name: name of the aggregation object (optional)
        :returns: self, which allows the method to be chainable with the other methods
        """

        agg_name = name if name else 'custom_agg'
        self.aggregations[agg_name] = agg
        return self

    def since(self, start, field=None):
        """
        Add the start date to query data starting from that date
        sets the default start date for each query

        :param start: date to start looking at the fields (from date)
        :param field: specific field for the start date in range filter
                      for the Search object
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            field = "grimoire_creation_date"
        self.start_date = start

        date_dict = {field: {"gte": "{}".format(self.start_date.isoformat())}}
        self.search = self.search.filter("range", **date_dict)
        return self

    def until(self, end, field=None):
        """
        Add the end date to query data upto that date
        sets the default end date for each query

        :param end: date to stop looking at the fields (to date)
        :param field: specific field for the end date in range filter
                      for the Search object
        :returns: self, which allows the method to be chainable with the other methods
        """

        if not field:
            field = "grimoire_creation_date"
        self.end_date = end

        date_dict = {field: {"lte": "{}".format(self.end_date.isoformat())}}
        self.search = self.search.filter("range", **date_dict)
        return self

    def by_authors(self, field=None):
        """
        Used to segregate the data with respect to the users. This method
        pops the latest aggregation from the self.aggregations dict and
        adds it as a nested aggregation under itself

        :param field: the field to create the parent agg (optional)
                      default: author_uuid
        :returns: self, which allows the method to be chainable with the other methods
        """

        # Parent aggregation
        agg_field = field if field else "author_uuid"
        agg_key = "terms_" + agg_field
        if agg_key in self.aggregations.keys():
            agg = self.aggregations[agg_key]
        else:
            agg = A("terms", field=agg_field, missing="others", size=self.size)

        child_agg_counter = self.child_agg_counter_dict[
            agg_key]  # 0 if not present because defaultdict
        child_name, child_agg = self.aggregations.popitem()

        # add child agg to parent agg
        agg.metric(child_agg_counter, child_agg)
        # insert this agg to the agg dict. This agg essentially replaces
        # the last agg that was in the agg dict
        self.aggregations[agg_key] = agg
        self.child_agg_counter_dict[agg_key] += 1
        return self

    def by_organizations(self, field=None):
        """
        Used to segregate the data according to organizations. This method
        pops the latest aggregation from the self.aggregations dict and
        adds it as a nested aggregation under itself

        :param field: the field to create the parent agg (optional)
                      default: author_org_name
        :returns: self, which allows the method to be chainable with the other methods
        """

        # this function currently only works for issues and PRs
        agg_field = field if field else "author_org_name"
        agg_key = "terms_" + agg_field
        if agg_key in self.aggregations.keys():
            agg = self.aggregations[agg_key]
        else:
            agg = A("terms", field=agg_field, missing="others", size=self.size)

        child_agg_counter = self.child_agg_counter_dict[
            agg_key]  # 0 if not present because defaultdict
        child_name, child_agg = self.aggregations.popitem()

        agg.metric(child_agg_counter, child_agg)
        self.aggregations[agg_key] = agg
        self.child_agg_counter_dict[agg_key] += 1
        return self

    def by_period(self,
                  field=None,
                  period=None,
                  timezone=None,
                  start=None,
                  end=None):
        """
        Create a date histogram aggregation using the last added aggregation for the
        current object. Add this date_histogram aggregation into self.aggregations

        :param field: the index field to create the histogram from
        :param period: the interval which elasticsearch supports, ex: "month", "week" and such
        :param timezone: custom timezone
        :param start: custom start date for the date histogram, default: start date under range
        :param end: custom end date for the date histogram, default: end date under range
        :returns: self, which allows the method to be chainable with the other methods
        """

        hist_period = period if period else self.interval_
        time_zone = timezone if timezone else "UTC"

        start_ = start if start else self.start_date
        end_ = end if end else self.end_date
        bounds = self.get_bounds(start_, end_)

        date_field = field if field else "grimoire_creation_date"
        agg_key = "date_histogram_" + date_field
        if agg_key in self.aggregations.keys():
            agg = self.aggregations[agg_key]
        else:
            agg = A("date_histogram",
                    field=date_field,
                    interval=hist_period,
                    time_zone=time_zone,
                    min_doc_count=0,
                    **bounds)

        child_agg_counter = self.child_agg_counter_dict[agg_key]
        child_name, child_agg = self.aggregations.popitem()

        agg.metric(child_agg_counter, child_agg)
        self.aggregations[agg_key] = agg
        self.child_agg_counter_dict[agg_key] += 1
        return self

    def get_bounds(self, start=None, end=None):
        """
        Get bounds for the date_histogram method

        :param start: start date to set the extended_bounds min field
        :param end: end date to set the extended_bounds max field
        :returns bounds: a dictionary containing the min and max fields
                         required to set the bounds in date_histogram aggregation
        """

        bounds = {}
        if start or end:
            # Extend bounds so we have data until start and end
            start_ts = None
            end_ts = None

            if start:
                start = start.replace(microsecond=0)
                start_ts = start.replace(tzinfo=timezone.utc).timestamp()
                start_ts_ms = start_ts * 1000  # ES uses ms
            if end:
                end = end.replace(microsecond=0)
                end_ts = end.replace(tzinfo=timezone.utc).timestamp()
                end_ts_ms = end_ts * 1000  # ES uses ms

            bounds_data = {}
            if start:
                bounds_data["min"] = start_ts_ms
            if end:
                bounds_data["max"] = end_ts_ms

            bounds["extended_bounds"] = bounds_data
        return bounds

    def reset_aggregations(self):
        """
        Remove all aggregations added to the search object
        """

        temp_search = self.search.to_dict()
        if 'aggs' in temp_search.keys():
            del temp_search['aggs']
            # from_dict returns a new Search object, so rebuild the search with
            # the same connection instead of discarding the result
            self.search = Search(using=self.index.es,
                                 index=self.index.index_name).update_from_dict(temp_search)
        self.parent_agg_counter = 0
        self.child_agg_counter = 0
        self.child_agg_counter_dict = defaultdict(int)

    def flush_aggregations(self):
        """
        Remove all the aggregations from the self.aggregations dict
        """

        self.aggregations = OrderedDict()

    def fetch_aggregation_results(self):
        """
        Loops through the self.aggregations dict and adds them to the Search object
        in order in which they were created. Queries elasticsearch and returns a dict
        containing the results

        :returns: a dictionary containing the response from elasticsearch
        """

        self.reset_aggregations()

        for key, val in self.aggregations.items():
            self.search.aggs.bucket(self.parent_agg_counter, val)
            self.parent_agg_counter += 1

        self.search = self.search.extra(size=0)
        response = self.search.execute()
        self.flush_aggregations()
        return response.to_dict()

    def fetch_results_from_source(self, *fields, dataframe=False):
        """
        Get values for specific fields in the elasticsearch index, from source

        :param fields: a list of fields that have to be retrieved from the index
        :param dataframe: if true, will return the data in the form of a pandas.DataFrame
        :returns: a list of dicts(key_val pairs) containing the values for the applied fields
                  if dataframe=True, will return a dataframe containing the data in rows
                  and the fields representing column names
        """

        if not fields:
            raise AttributeError(
                "Please provide the fields to get from elasticsearch!")

        self.reset_aggregations()

        self.search = self.search.extra(_source=fields)
        self.search = self.search.extra(size=self.size)
        response = self.search.execute()
        hits = response.to_dict()['hits']['hits']
        data = [item["_source"] for item in hits]

        if dataframe:
            df = pd.DataFrame.from_records(data)
            return df.fillna(0)
        return data

    def get_timeseries(self, child_agg_count=0, dataframe=False):
        """
        Get time series data for the specified fields and period of analysis

        :param child_agg_count: the child aggregation count to be used
                                default = 0
        :param dataframe: if dataframe=True, return a pandas.DataFrame object
        :returns: dictionary containing "date", "value" and "unixtime" keys
                  with lists as values containing data from each bucket in the
                  aggregation
        """

        res = self.fetch_aggregation_results()

        ts = {"date": [], "value": [], "unixtime": []}

        if 'buckets' not in res['aggregations'][str(self.parent_agg_counter -
                                                    1)]:
            raise RuntimeError(
                "Aggregation results have no buckets in time series results.")

        for bucket in res['aggregations'][str(self.parent_agg_counter -
                                              1)]['buckets']:
            ts['date'].append(parser.parse(bucket['key_as_string']).date())
            if str(child_agg_count) in bucket:
                # We have a subaggregation with the value
                # If it is percentiles we get the median
                if 'values' in bucket[str(child_agg_count)]:
                    val = bucket[str(child_agg_count)]['values']['50.0']
                    if val == 'NaN':
                        # ES returns NaN. Convert to None for matplotlib graph
                        val = None
                    ts['value'].append(val)
                else:
                    ts['value'].append(bucket[str(child_agg_count)]['value'])
            else:
                ts['value'].append(bucket['doc_count'])
            # unixtime comes in ms from ElasticSearch
            ts['unixtime'].append(bucket['key'] / 1000)

        if dataframe:
            df = pd.DataFrame.from_records(ts, index="date")
            return df.fillna(0)
        return ts

    def get_aggs(self):
        """
        Compute the values for single valued aggregations

        :returns: the single aggregation value
        """

        res = self.fetch_aggregation_results()
        if 'aggregations' in res and 'values' in res['aggregations'][str(
                self.parent_agg_counter - 1)]:
            try:
                agg = res['aggregations'][str(self.parent_agg_counter -
                                              1)]['values']["50.0"]
                if agg == 'NaN':
                    # ES returns NaN. Convert to None for matplotlib graph
                    agg = None
            except Exception as e:
                raise RuntimeError(
                    "Multivalue aggregation result not supported")

        elif 'aggregations' in res and 'value' in res['aggregations'][str(
                self.parent_agg_counter - 1)]:
            agg = res['aggregations'][str(self.parent_agg_counter -
                                          1)]['value']

        else:
            agg = res['hits']['total']

        return agg

    def get_list(self, dataframe=False):
        """
        Compute the value for multi-valued aggregations

        :returns: a dict containing 'keys' and their corresponding 'values'
        """

        res = self.fetch_aggregation_results()
        keys = []
        values = []
        for bucket in res['aggregations'][str(self.parent_agg_counter -
                                              1)]['buckets']:
            keys.append(bucket['key'])
            values.append(bucket['doc_count'])

        result = {"keys": keys, "values": values}
        if dataframe:
            result = pd.DataFrame.from_records(result)
        return result
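A short usage sketch of the chainable API above: count distinct authors of closed items per month as a time series. `github_index` stands for whatever Index-like object the constructor expects (it must expose `.es` and `.index_name`); the dates are illustrative.

from datetime import datetime

q = Query(github_index, interval="month")
ts = (q.is_closed()
       .since(datetime(2019, 1, 1))
       .until(datetime(2019, 12, 31))
       .get_cardinality("author_uuid")
       .by_period()
       .get_timeseries(dataframe=True))
# ts is a DataFrame indexed by date, one row per month, holding the number
# of distinct authors that closed items in that month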
Example #38
0
def ESQuery(
        index=None,
        searchfields=None,
        returnfields=None,
        query=None,  # search_term
        source_fields=None,
        aggregations=None,
        advanced=None,  # used to add additional 'must' advanced search fields to search
        advanced_should=None,  # used to add additional 'should' advanced search fields to search
        sort_order=[],
        nested=None,
        nested_must=None,
        nested_should=None,
        nested_range=None,
        preference="_primary_first",
        explain=None,
        offset=0,
        limit=15,
        fuzziness=1):
    s = Search(using=client, index=index).source(returnfields)
    s = s.params(preference=preference)
    q = None
    queries = []

    if nested is not None:
        print("Adding nested search")
        for nest in nested:
            queries.append(
                Q("nested",
                  path=nest[0],
                  query=Q("match", **{nest[1]: nest[2]})))

    if nested_must is not None:
        for nest in nested_must:
            queries.append(
                Q("nested",
                  path=nest[0],
                  query=Q("match", **{nest[1]: nest[2]})))

        s = s.query(Q('bool', must=queries))[offset:limit]
        queries = []

    if nested_should is not None:
        for nest in nested_should:
            queries.append(
                Q("nested",
                  path=nest[0],
                  query=Q("match", **{nest[1]: nest[2]})))

        s = s.query(Q('bool', should=queries))[offset:limit]

        if len(nested_should) > 1:
            s.query.minimum_should_match = 1

        queries = []

    if nested_range is not None:
        for nest in nested_range:
            queries.append(
                Q("nested",
                  path=nest[0],
                  filter=Q("range", **{nest[1]: {
                                           nest[2]: nest[3]
                                       }})))
        s = s.query(Q('bool', must=queries))
        queries = []

    if advanced is not None:
        raw_queries = []
        for advance in advanced:
            print("Adding match search on %s" % advance[0])
            raw_queries.append(Q("match", **{advance[0]: advance[1]}))

            s = s.query(Q('bool', must=raw_queries))

    if advanced_should is not None:
        for advance in advanced_should:
            queries.append(Q("match", **{advance[0]: advance[1]}))
        s = s.query(Q('bool', should=queries))[offset:limit]
        queries = []

    if searchfields is not None:
        if query is not None and query != '':
            queries.append(
                Q("simple_query_string",
                  query=query,
                  default_operator="and",
                  flags="PREFIX|PHRASE|NOT|AND|OR",
                  fields=searchfields))

    if len(queries) > 1:
        s = s.query(Q('bool', should=queries))[offset:limit]
    elif len(queries) > 0:
        s = s.query(Q('bool', must=queries[0]))[offset:limit]

    if source_fields is not None:
        s = s.extra(_source={'include': source_fields})

    if len(sort_order) > 0:
        s = s.sort(sort_order[0])

    if aggregations is not None:
        for agg in aggregations:
            a = A('terms', field=agg[1], size=10)
            s.aggs.bucket(agg[0], a)

    s = s.extra(explain=explain)

    results = s.execute()
    print("Query: ", json.dumps(s.to_dict()))

    print("Results: ", json.dumps(results.to_dict()))
    return results
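A brief usage sketch for ESQuery above; the index, field names and aggregation are illustrative, and `client` is assumed to be the module-level Elasticsearch connection the function uses.

results = ESQuery(
    index="datasets",
    searchfields=["title^3", "description"],
    returnfields=["title", "description", "doi"],
    query="ocean temperature",
    aggregations=[("by_keyword", "keywords.raw")],
    sort_order=["-_score"],
    offset=0,
    limit=25,
    explain=False,
)
for hit in results:
    print(hit.meta.score, hit.title)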
def search(search_params,
           index,
           page_size,
           ip,
           request,
           filter_dead,
           page=1) -> Tuple[List[Hit], int, int, str]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results, and a query suggestion.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the API
    # with its corresponding field in Elasticsearch. "None" means that the
    # names are identical.
    filters = [('extension', None), ('categories', None),
               ('aspect_ratio', None), ('size', None), ('source', 'provider'),
               ('license', 'license__keyword'),
               ('license_type', 'license__keyword')]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)
    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})
    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})
    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count, suggestion
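A minimal, standalone sketch of the popularity-boost step above: the existing query becomes the `must` clause of a bool query whose `should` clauses are rank_feature queries, so popular documents score higher without being required to match on popularity at all. The index name and query text are illustrative.

from elasticsearch_dsl import Q, Search

base = Search(index='image').query('simple_query_string', query='dog',
                                   fields=['tags.name', 'title', 'description'])
factors = ['comments', 'views', 'likes']
boost_factor = 100 / len(factors)
rank_queries = [Q('rank_feature', field=f, boost=boost_factor) for f in factors]
boosted = Search(index='image').query(
    Q('bool', must=base.query, should=rank_queries, minimum_should_match=1))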
Example #40
0
    def get(self, request):
        """GET handler."""
        q = request.GET.get('query_string')
        offset = int(request.GET.get('offset', 0))
        limit = int(request.GET.get('limit', 10))
        if limit > 500:
            return HttpResponseBadRequest("limit must not exceed 500")
        type_filter = request.GET.get('type_filter', 'all')
        doc_type_map = {
            list(Index(settings.ES_INDEX_PREFIX.format('publications')).get_alias().keys())[0]: 'publication',
            list(Index(settings.ES_INDEX_PREFIX.format('publications-legacy')).get_alias().keys())[0]: 'publication',
            list(Index(settings.ES_INDEX_PREFIX.format('files')).get_alias().keys())[0]: 'file',
            list(Index(settings.ES_INDEX_PREFIX.format('cms')).get_alias().keys())[0]: 'modelresult'
        }

        public_files_query = CommunityDataSearchManager(request).construct_query() | PublishedDataSearchManager(request).construct_query()
        publications_query = PublicationsSiteSearchManager(request).construct_query()
        cms_query = es_query = CMSSearchManager(request).construct_query()

        if type_filter == 'public_files':
            es_query = Search().query(public_files_query)
        elif type_filter == 'published':
            es_query = Search().query(publications_query)
        elif type_filter == 'cms':
            es_query = Search().query(cms_query).highlight(
                    'body',
                    fragment_size=100).highlight_options(
                    pre_tags=["<b>"],
                    post_tags=["</b>"],
                    require_field_match=False)
        elif type_filter == 'all':
            es_query = Search().query(public_files_query | publications_query | cms_query).highlight(
                    'body',
                    fragment_size=100).highlight_options(
                    pre_tags=["<b>"],
                    post_tags=["</b>"],
                    require_field_match=False)
        es_query = es_query.extra(from_=offset, size=limit)
        try:
            res = es_query.execute()
        except (TransportError, ConnectionTimeout) as err:
            if getattr(err, 'status_code', 500) == 404:
                raise
            res = es_query.execute()

        out = {}
        hits = []

        for r in res:
            d = r.to_dict()
            d["doc_type"] = doc_type_map[r.meta.index]
            if hasattr(r.meta, 'highlight'):
                highlight = r.meta.highlight.to_dict()
                d["highlight"] = highlight
            if r.meta.doc_type == 'publication' and hasattr(r, 'users'):
                users = r.users
                pi = r.project.value.pi
                pi_user = [x for x in users if x.username==pi][0]
                d["piLabel"] = "{}, {}".format(pi_user.last_name, pi_user.first_name)
            hits.append(d)

        out['hits'] = hits
        out['all_total'] = Search().query(public_files_query | publications_query | cms_query).count()
        out['public_files_total'] = Search().query(public_files_query).count()
        out['published_total'] = Search().query(publications_query).count()
        out['cms_total'] = Search().query(cms_query).count()
        print(out)

        return JsonResponse(out, safe=False)