Example #1
def search(query,
           order=None,
           maxPrice=None,
           minPrice=None,
           sortBy=None,
           page=None):
    s = Search(using=es, index="shopee")
    q = Q("multi_match", query=query, fields=['item_name', 'item_name.folded'])
    s = s.query(q)
    if sortBy == 'ctime':
        s = s.sort({"item_ctime": {"order": "desc"}})
    elif sortBy == 'sales':
        s = s.sort({"item_sold": {"order": "desc"}})
    elif sortBy == 'price' and order != "desc":
        s = s.sort({"item_price": {"order": "asc"}})
    elif sortBy == 'price' and order == "desc":
        s = s.sort({"item_price": {"order": "desc"}})
    item_price = {}
    if minPrice:
        minPrice = int(minPrice) * 100000
        item_price['gte'] = str(minPrice)
    if maxPrice:
        maxPrice = int(maxPrice) * 100000
        item_price['lte'] = maxPrice
    if item_price:
        print(item_price)
        s = s.filter('range', item_price=item_price)
    page = int(page) if page else 0
    s = s[20 * page:20 * (page + 1)]
    response = s.execute()
    results = response['hits']['hits']
    return results
Example #2
 def fetch_events(self, query):
     search = Search(**self.get_search_kwargs())
     search = search.query(query)
     search = search.sort('-@timestamp')  # reassign: sort() returns a new Search
     response = search.execute()
     results = get_response_dict(response)
     return results[0]
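elasticsearch_dsl Search objects behave immutably: query(), filter(), sort(), source() and similar methods return a modified copy rather than changing the object in place, which is why the examples keep reassigning the result. A minimal sketch of the pattern, with a hypothetical index and field name:

from elasticsearch_dsl import Search

s = Search(index="logs-*")
s.sort('-@timestamp')        # no effect: the sorted copy is discarded
s = s.sort('-@timestamp')    # correct: keep the copy returned by sort()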
Example #3
def es_get_accounts(main_chain='eos',
                    user_id=None,
                    start_vol=None,
                    end_vol=None,
                    from_idx=0,
                    size=100,
                    order='totalEos',
                    fields=None):
    index = '{}_account'.format(main_chain)

    if not fields:
        fields = [
            'id', 'timestamp', 'updateTimestamp', 'userId', 'totalEos',
            'liquidEos', 'stackedEos', 'unstackingEos'
        ]

    if user_id:
        s = Search(using=es_client, index=index, doc_type='doc') \
            .filter('term', userId=user_id)
    elif start_vol and end_vol:
        range = {order: {'gte': start_vol, 'lt': end_vol}}
        s = Search(using=es_client, index=index, doc_type='doc') \
            .source(include=fields) \
            .filter('range', **range)
        s = s.sort({order: {"order": "desc"}})
    else:
        s = Search(using=es_client, index=index,
                   doc_type='doc').source(include=fields)
        s = s.sort({order: {"order": "desc"}})

    resp = s[from_idx:from_idx + size].execute()

    return es_resp_to_payload(resp)
Example #4
def getUSWDSquery(indexbase, query, version, agency, domaintype, sort):
    index = indexbase + '-uswds2'
    try:
        query = int(query)
    except (ValueError, TypeError):
        query = 0

    s = Search(using=es, index=index)
    if sort == 'Score':
        s = s.sort('-data.total_score')
    else:
        s = s.sort('domain')
    s = s.query(Bool(should=[Range(data__total_score={'gte': query})]))
    if version != 'all versions':
        if version == 'detected versions':
            s = s.query("query_string", query='v*', fields=['data.uswdsversion'])
        else:
            versionquery = '"' + version + '"'
            s = s.query("query_string", query=versionquery, fields=['data.uswdsversion'])
    if agency != 'All Agencies':
        agencyquery = '"' + agency + '"'
        s = s.query("query_string", query=agencyquery, fields=['agency'])
    if domaintype != 'All Branches':
        domaintypequery = '"' + domaintype + '"'
        s = s.query("query_string", query=domaintypequery, fields=['domaintype'])

    return s
Example #5
def get_all_ids(index=None, id_field='recid', last_updated=None, latest_first=False):
    """Get all record or inspire ids of publications in the search index

    :param index: name of index to use.
    :param id_field: elasticsearch field to return. Should be 'recid' or 'inspire_id'
    :return: list of integer ids
    """
    if id_field not in ('recid', 'inspire_id'):
        raise ValueError('Invalid ID field %s' % id_field)

    search = Search(using=es, index=index) \
        .filter("term", doc_type=CFG_PUB_TYPE) \
        .source(fields=[id_field])

    if last_updated:
        search = search.filter("range", **{'last_updated': {'gte': last_updated.isoformat()}})

    if latest_first:
        search = search.sort({'last_updated' : {'order' : 'desc'}})
    else:
        search = search.sort('recid')

    search = search.params(preserve_order=True)

    return [int(h[id_field]) for h in search.scan()]
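Example #5 combines a sort with scan(): scan() streams hits through the scroll API, which by default does not honor the requested order, so the example opts back in with params(preserve_order=True) at some performance cost. A minimal sketch of the same idea, assuming a hypothetical index and field name:

from elasticsearch_dsl import Search

s = Search(index="records").sort('recid').params(preserve_order=True)
for hit in s.scan():   # hits arrive in recid order instead of arbitrary scroll order
    print(hit.recid)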
Example #6
    def post(self):
        json = request.get_json()
        s = Search(using=es,index='twitter',doc_type='items')
        username = json.pop('username') if 'username' in json else None
        following = True
        if 'following' in json:
            following = json.pop('following')
        else:
            if username:
                following = True
        timestamp = json.pop('timestamp') if 'timestamp' in json else time()
        search = 'q' in json
        limit = json.pop('limit') if 'limit' in json and json['limit'] <= 100 else 50
        following_list = db.user.find_one({'username':session['username']})['following'] # do we only need this if following=true?
        #query = {'timestamp':{'$lte':timestamp}}
        s = s.filter('range', timestamp={'lte':timestamp})
        if search:
            #query['$text'] = {'$search':json['q']}
            s = s.query('match', content=json['q'])
        if username:
            if following:
                # only match this username if the requester actually follows them
                s = s.filter('term', username=username if username in following_list else '')
            else:
                #query['username'] = username
                s = s.filter('term', username=username)
        else:
            if following:
                #query['username'] = {'$in': following_list}
                s = s.filter('terms',username=following_list)
        # my code        
        if 'parent' in json:
            #query['parent'] = json['parent']
            s = s.filter('term', parent=json['parent'])
        if 'replies' not in json:
            json['replies'] = True
        if not json['replies']:
            s = s.filter('term', parent=None)
        # endmy code        
        if 'rank' not in json:
            json['rank'] = 'interest'
        s = s[0:limit]

        if json['rank'] == 'time':
            sort_key = 'timestamp'
            s = s.sort('-timestamp')
        else:
            sort_key = 'interest_score'
            s = s.sort('-interest_score')
        #sort_dir = -1

        #results = db.items.find(query).sort(sort_key, sort_dir).limit(limit)
        #results = db.items.find(filter=query, limit=limit, sort=sort_by)
        #results = db.items.aggregate([{'$match':query}, {'$limit': limit}, {'$sort': sort_by}])
        results = s.execute()
        l = [x['_source'].to_dict() for x in results['hits']['hits']]
        return Response(response = dumps({'status':'OK','items':l}),mimetype='application/json')
Example #7
 def get_all_articles(self):
     '''
     Get all articles from the index.
     '''
     s = Search(index="article")
     s = s.query("match_all")
     s = s.sort({'created': {'order': 'desc'}})  # sort in descending order; reassign since sort() returns a new Search
     response = self.get_response(s)
     return response
Example #8
    def _search(self, query):
        s = Search(using=self.Client, index="winlogbeat-*").query(query)

        if self.DTRange != None:
            s = s.filter('range', **self.DTRange)

        # source() and sort() return new Search objects, so keep the returned copies
        s = s.source(includes=['winlog.*'])
        s = s.sort('-winlog.event_data.UtcTime')

        if self.Scan:
            return s.scan()
        else:
            return s.execute().hits
Example #9
    def _apply_paging(self,
                      catalog: CatalogName,
                      es_search: Search,
                      pagination: Mapping[str, Any]):
        """
        Applies the pagination to the ES Search object
        :param catalog: The name of the catalog to query
        :param es_search: The ES Search object
        :param pagination: Dictionary with raw entries from the GET Request.
        It has: 'size', 'sort', 'order', and one of 'search_after', 'search_before', or 'from'.
        :return: An ES Search object where pagination has been applied
        """
        # Extract the fields for readability (and slight manipulation)

        _sort = pagination['sort'] + '.keyword'
        _order = pagination['order']

        field_type = self.field_type(catalog, tuple(pagination['sort'].split('.')))
        _mode = field_type.es_sort_mode

        def sort_values(sort_field, sort_order, sort_mode):
            assert sort_order in ('asc', 'desc'), sort_order
            return (
                {
                    sort_field: {
                        'order': sort_order,
                        'mode': sort_mode,
                        'missing': '_last' if sort_order == 'asc' else '_first'
                    }
                },
                {
                    '_uid': {
                        'order': sort_order
                    }
                }
            )

        # Using search_after/search_before pagination
        if 'search_after' in pagination:
            es_search = es_search.extra(search_after=pagination['search_after'])
            es_search = es_search.sort(*sort_values(_sort, _order, _mode))
        elif 'search_before' in pagination:
            es_search = es_search.extra(search_after=pagination['search_before'])
            rev_order = 'asc' if _order == 'desc' else 'desc'
            es_search = es_search.sort(*sort_values(_sort, rev_order, _mode))
        else:
            es_search = es_search.sort(*sort_values(_sort, _order, _mode))

        # fetch one more than needed to see if there's a "next page".
        es_search = es_search.extra(size=pagination['size'] + 1)
        return es_search
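Example #9 pages with search_after, which takes the sort values of the last hit on the current page instead of a numeric offset, so deep pages stay cheap. A minimal sketch of driving that loop by hand, assuming a hypothetical index and a unique sort field:

from elasticsearch_dsl import Search

s = Search(index="files").sort('file_id')        # file_id assumed unique, so it doubles as the tiebreaker
page = s.extra(size=100).execute()
while page.hits:
    last_sort = list(page.hits[-1].meta.sort)    # sort values of the final hit on this page
    page = s.extra(size=100, search_after=last_sort).execute()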
Example #10
def get_summary_statistics():
    """
    Obtains statistics about current sum of flows, packets, bytes.

    :return: JSON with status "ok" or "error" and requested data.
    """

    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host':
            myconf.get('consumer.hostname'),
            'port':
            myconf.get('consumer.port')
        }])
        elastic_bool = []
        elastic_bool.append(
            {'range': {
                '@timestamp': {
                    'gte': "now-5m",
                    'lte': "now"
                }
            }})
        elastic_bool.append({'term': {'@type': 'protocols_statistics'}})

        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('sum_of_flows', 'sum', field='flows')
        s.aggs.bucket('sum_of_packets', 'sum', field='packets')
        s.aggs.bucket('sum_of_bytes', 'sum', field='bytes')
        s = s.sort('@timestamp')  # reassign: sort() returns a new Search
        result = s.execute()

        # Result Parsing into CSV in format: timestamp, tcp protocol value, udp protocol value
        data = "Timestamp, Flows, Packets, Bytes;"
        timestamp = "Last 5 Minutes"
        data += timestamp + ', ' +\
                str(int(result.aggregations.sum_of_flows['value'])) + ', ' +\
                str(int(result.aggregations.sum_of_packets['value'])) + ', ' +\
                str(int(result.aggregations.sum_of_bytes['value']))

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response

    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(
            str(e)) + '"}'
        return json_response
Example #11
def search_more_like_this(talk):
    """ Get more like this documents
    """
    client = Elasticsearch([{
        'host':
        settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port':
        settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    s = s.query(
        MoreLikeThis(
            like={"_index": "vtalks", "_type": "talk", "_id": talk.id},
            fields=['title', 'description', 'tags'],
        )
    )

    # Sorting
    s = s.sort({"_score": {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
Example #12
    def search(self, **params):
        index = params.get('index', self.index)
        search = Search(using=self.client, index=index)

        page = params.get('page', None)
        per_page = params.get('per_page', None)
        if page and per_page:
            # 'from' is a document offset, so convert the 1-based page number;
            # slicing is the public alternative to assigning Search._extra directly
            search = search[(page - 1) * per_page:page * per_page]

        sort = params.get('sort', None)
        if sort and sort.replace('-', '') in ['created_at', 'level']:
            search = search.sort(sort)

        date_filter = self._filter_by_date_interval(params)
        if date_filter:
            search = search.filter(date_filter)

        level = params.get('group_by', None)
        if level:
            search = search.query('match', level=level)

        hits = search.execute()

        format = params.get('format', 'object')
        if format == 'dict':
            return self._to_dict(hits)
        else:
            return self._to_logs(hits)
Example #13
def _create_search(dbm, form_model, local_time_delta, pagination_params,
                   sort_params, search_parameters):
    es = Elasticsearch(hosts=[{
        "host": ELASTIC_SEARCH_HOST,
        "port": ELASTIC_SEARCH_PORT
    }])
    search = Search(using=es, index=dbm.database_name, doc_type=form_model.id)
    search = search.sort(sort_params)
    search = search.extra(**pagination_params)
    search = search.query('match', status='Success')
    search = search.query('term', void=False)
    if search_parameters.get('data_sender_filter'):
        search = search.query(
            "term",
            **{"datasender.id": search_parameters.get('data_sender_filter')})
    if search_parameters.get('unique_id_filters'):
        search = _add_unique_id_filters(
            form_model, search_parameters.get('unique_id_filters'), search)
    if search_parameters.get('date_question_filters'):
        for key, values in search_parameters.get(
                'date_question_filters').iteritems():
            query = DateQuestionRangeFilter(values['dateRange'], form_model,
                                            key).build_filter_query()
            if query is not None:
                search = search.query(query)
    if search_parameters.get('search_text'):
        query_text_escaped = ElasticUtilsHelper().replace_special_chars(
            search_parameters.get('search_text'))
        search = search.query("query_string", query=query_text_escaped)
    submission_date_query = SubmissionDateRangeFilter(
        search_parameters.get('submission_date_range'),
        local_time_delta).build_filter_query()
    if submission_date_query:
        search = search.query(submission_date_query)
    return search
Example #14
def make_query(query, filters, page, sort_by):
    try:
        client = Elasticsearch()
        s = Search(client, index=app.config['INDEX'])

        if query:
            s = s.query(QueryString(query=escape_query(query)))
            if not sort_by:
                sort_by = "relevance"
        else:
            s = s.query(MatchAll())
            if not sort_by:
                sort_by = DEFAULT_SORT_BY

        s = s.sort(SORT_BY.get(sort_by, DEFAULT_SORT_BY)['value'])

        start = (page - 1) * 20
        end = start + 20
        s = s[start:end]

        if filters:
            s = s.filter('bool', must=filters)

        result = s.execute()
        return result
    except ConnectionError as ex:
        return None
Example #15
def consensus(offset=60):
    """
    check for 'eth.chain.new_head' messages
    and return the max number of clients, that had the same head
    during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q('match', message='eth.chain.new_head'))
    s = s.filter('exists', field='json_message.eth.chain.new_head.block_number')
    s = s.sort({'json_message.eth.chain.new_head.ts': {'order': 'desc', 'ignore_unmapped': 'true'}})
    response = s.execute()

    # Get latest block number
    x = max(hit['_source']['json_message']['eth.chain.new_head']['block_number'] for hit in response.hits.hits)

    # By default, the buckets are ordered by their doc_count descending
    # s.aggs.bucket('by_block_hash', 'terms', field='json_message.eth.chain.new_head.block_hash', size=3)

    # Reach consensus around latest block number
    s = Search(client)
    s = s.filter(time_range_filter(field="json_message.eth.chain.new_head.ts", offset=offset))
    s.aggs.bucket('latest', 'range',
                  field='json_message.eth.chain.new_head.block_number',
                  ranges=[{"from": x - 1, "to": x + 1}]).bucket(
                      'by_block_hash', 'terms',
                      field='json_message.eth.chain.new_head.block_hash',
                      size=3)
    # s = s[10:10]
    response = s.execute()
    # pprint(response)

    if response:
        return max(tag.doc_count for tag in response.aggregations.latest.buckets[0].by_block_hash.buckets)
    else:
        return 0
Example #16
def main():
    """Query ES to get first and last commit of each author together with
    some extra info.
    """
    es_conn = create_conn()

    # Create search object
    s = Search(using=es_conn, index='git')

    # FILTER: retrieve commits before given year
    s = s.filter('range', grimoire_creation_date={'lt': 'now/y'})

    # Bucketize by uuid and get first and last commit (commit date is stored in
    # author_date field)
    s.aggs.bucket('authors', 'terms', field='author_uuid', size=10000000) \
        .metric('first', 'top_hits',
                _source=['author_date', 'author_org_name', 'author_uuid', 'project'],
                size=1,
                sort=[{"author_date": {"order": "asc"}}]) \
        .metric('last_commit', 'max', field='author_date')

    # Sort by commit date
    s = s.sort("author_date")

    #print(s.to_dict())
    result = s.execute()

    # Print result
    print(json.dumps(result.to_dict()['aggregations'], indent=2, sort_keys=True))
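The aggregation in Example #16 nests a sorted top_hits metric (earliest commit) and a max metric (latest commit date) inside a terms bucket per author, so the per-author data lives under the aggregations section rather than in the hits. A minimal sketch of walking that response shape, reusing the bucket and metric names defined above:

for bucket in result.to_dict()['aggregations']['authors']['buckets']:
    first = bucket['first']['hits']['hits'][0]['_source']   # oldest commit for this author_uuid
    print(bucket['key'], first['author_date'], bucket['last_commit']['value'])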
Example #17
 def build_file_es(self, args, search_condition):
     file_es = Search() \
         .query(search_condition)
     file_es = file_es.sort(*self.sort_condition(args))
     file_es = self.add_custom_source(file_es, args)
     file_es = self.add_page_limit_to_file_es(args, file_es)
     return file_es
Example #18
def fetch_incidents():
    last_fetch, last_fetch_timestamp = get_last_fetch_time()
    es = elasticsearch_builder()

    query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elasticsearch can use epoch timestamps (in milliseconds) as a date representation regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {
            TIME_FIELD: {
                'gt': last_fetch_timestamp
            }
        }})
    search = search.sort({TIME_FIELD: {
        'order': 'asc'
    }})[0:FETCH_SIZE].query(query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List

    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            incidents, last_fetch = results_to_incidents_timestamp(
                response, last_fetch)
            demisto.setLastRun({'time': last_fetch})

        else:
            incidents, last_fetch = results_to_incidents_datetime(
                response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})

        demisto.info('extract {} incidents'.format(len(incidents)))

    demisto.incidents(incidents)
Example #19
    def search(self, query):
        skip = int(query.pop("skip", 0))
        s = Search(using=es, index=config['index']['resolver'])
        if self.uuid:
            s = s.query('match', _dataset=self.uuid)
        if self.core:
            s = s.query('match', _core=self.core)
        for k, v in query.iteritems():
            options = config['search'].get(k, {})
            if not v or str(v) == "":
                continue
            if ".kw" in k or options.get('type') == "keyword":
                s = s.filter('term', **{k.replace(".term", ""): v})
            elif ".prefix" in k or options.get('type') == "prefix":
                s = s.query('prefix', **{k.replace(".prefix", ""): v.lower()})
            elif ".fuzzy" in k or options.get('type') == "fuzzy":
                s = s.query('fuzzy', **{k.replace(".fuzzy", ""): v.lower()})
            elif ".term" in k or options.get("type") == "term":
                s = s.query('term', **{k.replace(".term", ""): v.lower()})
            else:
                s = s.query('match', **{k: v})

        a = A('geo_bounds', field='_location')
        s.aggs.bucket('viewport', a)
        s = s.sort('_id')
        s = s[skip:skip + 50]
        result = s.execute()
        query['skip'] = skip
        return result
Example #20
 def events():
     """Returns list of Event objects.
     
     @returns: list
     """
     s = Search(doc_type='events')[0:MAX_SIZE]
     s = s.sort('start_date')
     s = s.fields([
         'id',
         'published',
         'title',
         'description',
         'start_date',
         'end_date',
         'article_title',
         'resource_uri',
     ])
     response = s.execute()
     data = [
         Event(
             id = hit.meta.id,
             published = hitvalue(hit, 'published'),
             title = hitvalue(hit, 'title'),
             description = hitvalue(hit, 'description'),
             start_date = hitvalue(hit, 'start_date'),
             end_date = hitvalue(hit, 'end_date'),
             article_title = hitvalue(hit, 'article_title'),
             resource_uri = hitvalue(hit, 'resource_uri'),
         )
         for hit in response
     ]
     return data
Example #21
    def get_elk_response(self, request, task_id):
        # FIXME try to use django-rest-elasticsearch instead
        page = int(request.GET.get('page')) if request.GET.get('page') else 0
        limit = int(
            request.GET.get('limit')) if request.GET.get('limit') else 20
        if request.GET.get('order'):
            field, order = request.GET.get('order').split(',')
            sort_option = {field: {"order": order}}
        else:
            sort_option = {}
        client = Elasticsearch(hosts=[ELK_HOST + ':9200'],
                               http_auth=('elastic', 'L5M3LPXk6QhxTyZenwo5'))

        s = Search(using=client, index="logstash*").query("match",
                                                          task_id=task_id)
        if request.GET.get('category'):
            s = s.query(
                Match(categories={"query": request.GET.get('category')}))

        if sort_option:
            s = s.sort(sort_option)
        s = s[(page * limit):(page * limit + limit)]

        try:
            elk_response = s.execute()
        except RequestError as exc:
            logging.warning(exc)
            return Response({"Message": "Wrong query!"})
        return elk_response, limit, page
Example #22
 def get(self, request, *args, **kwargs):
     q = None
     page = 0
     page_size = 10
     order_fields = []
     for key,value in request.GET.items():
         if key in ('tag','content','autocomplete'):
             q = Q('match', tag=value)
         if key == 'page':
             page = int(value) - 1 if int(value) > 0 else 0
         if key == 'page_size':
             page_size = int(value)
         if key in ('order', 'orderby'):
             order_fields = value.split(',')
     if page_size > 1000:
         page_size = 1000
     s = Search(index='galaxy_tags')
     s = s.query(q) if q else s
     s = s.sort(*order_fields) if len(order_fields) > 0 else s
     s = s[page * page_size:page * page_size + page_size]
     result = s.execute()
     serializer = ElasticSearchDSLSerializer(result.hits, many=True)
     response = get_response(request=request, result=result, view='api:tags_search_view')
     response['results'] = serializer.data
     return Response(response)
Example #23
    def search(self,
               index: str,
               query_string: QueryString,
               search_from: int,
               size: int,
               sort_by: dict = None,
               fields: list = None):
        results = []
        more = False
        search = Search(
            using=self.client,
            index=index).query(query_string).source(fields)[search_from:size]
        if sort_by:
            search = search.sort(sort_by)
        search_results = search.execute().to_dict()
        total = int(search_results['hits']['total']['value'])

        for result in search_results['hits']['hits']:
            response = {}
            response.update(result['_source'])
            results.append(response)

        if -1 < size < total:
            more = True

        return {
            "more": more,
            "objects": results,
        }
Example #24
def generateRawQuery(user, starttime, endtime):
    """
    Generate the raw query to get all usage for a user between starttime and endtime.
    """
    es = Elasticsearch(
        [GRACC],
        timeout=300,
        use_ssl=True,
        verify_certs=True
    )
    endtime = endtime + datetime.timedelta(days=1)
    MAXSZ = 2 ** 30
    index = "gracc.osg.raw*"
    s = Search(using=es, index=index)
    # Starttime and endtime are both datetime objects
    print("Querying for user {} between {} and {}".format(user, starttime, endtime))
    s = s.query(
        "bool",
        filter=[
            Q("range", EndTime={"gte": starttime, "lt": endtime})
            & Q("term", ResourceType="Payload")
            & Q("term", DN=user) 
        ],
    )
    s = s.sort("StartTime")
    #print(s.to_dict())
    return s
Example #25
 def locations():
     """Returns list of Location objects.
     
     @returns: list
     """
     s = Search(doc_type='location')[0:MAX_SIZE]
     s = s.sort('id')
     s = s.fields([
         'id',
         'category',
         'title',
         'location_name',
         'description',
         'lat',
         'lng',
         'resource_uri',
         'location_uri',
         'location_url',
     ])
     response = s.execute()
     return [
         Location(
             id = hitvalue(hit, 'id'),
             category = hitvalue(hit, 'category'),
             title = hitvalue(hit, 'title'),
             location_name = hitvalue(hit, 'location_name'),
             description = hitvalue(hit, 'description'),
             lat = hitvalue(hit, 'lat'),
             lng = hitvalue(hit, 'lng'),
             resource_uri = hitvalue(hit, 'resource_uri'),
             location_uri = hitvalue(hit, 'location_uri'),
             location_url = hitvalue(hit, 'location_url'),
         )
         for hit in response
     ]
Example #26
    def test_paginator(self):
        search = Search(index=Token.es_doc_type._doc_type.index,
                        doc_type=Token.es_doc_type._doc_type.name)
        search = search.sort('name')

        page_size = 2
        paginator = ESSearchPaginator(search, page_size)

        page = paginator.page(1)

        self.assertTrue(page.has_other_pages)
        self.assertEqual(len(page.hits), page_size)
        self.assertEqual(page.total_count, 3)

        self.assertEqual(page.hits[0]['name'], 'token 0')
        self.assertEqual(page.hits[1]['name'], 'token 1')

        self.assertEqual(page.paginator, paginator)
        self.assertEqual(page.number, 1)
        self.assertIsNotNone(page.response)

        page = paginator.page(2)

        self.assertFalse(page.has_other_pages)
        self.assertEqual(len(page.hits), 1)

        self.assertEqual(page.hits[0]['name'], 'token 2')
Example #27
 def pages():
     """Returns list of published light Page objects.
     
     @returns: list
     """
     KEY = 'encyc-front:pages'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='articles').filter('term', published_encyc=True)[0:MAX_SIZE]
         s = s.sort('title_sort')
         s = s.fields([
             'url_title',
             'title',
             'title_sort',
             'published',
             'modified',
             'categories',
         ])
         response = s.execute()
         data = [
             Page(
                 url_title  = hitvalue(hit, 'url_title'),
                 title      = hitvalue(hit, 'title'),
                 title_sort = hitvalue(hit, 'title_sort'),
                 published  = hitvalue(hit, 'published'),
                 modified   = hitvalue(hit, 'modified'),
                 categories = hit.get('categories',[]),
                )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     return data
Example #28
 def authors(num_columns=None):
     """Returns list of published light Author objects.
     
     @returns: list
     """
     KEY = 'encyc-front:authors'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='authors')[0:MAX_SIZE]
         s = s.sort('title_sort')
         s = s.fields([
             'url_title',
             'title',
             'title_sort',
             'published',
             'modified',
         ])
         response = s.execute()
         data = [
             Author(
                 url_title  = hitvalue(hit, 'url_title'),
                 title      = hitvalue(hit, 'title'),
                 title_sort = hitvalue(hit, 'title_sort'),
                 published  = hitvalue(hit, 'published'),
                 modified   = hitvalue(hit, 'modified'),
             )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     if num_columns:
         return _columnizer(data, num_columns)
     return data
Example #29
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}

    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool',
                 should=[F('term', message=start_message),
                         F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc,  we want the latest events
    response = s.execute()

    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
Example #30
def load_filtered_top_ko_associations_search_after(filters, search_after = '', size=50):
    """Retrieves top associations and filter them through the tickable options"""
    s = Search(using=es, doc_type='ko_associations')
    s = s.sort('-score', '_uid')
    # By default, leave out associations with no gene
    s = s.filter(Q({'nested':{'path':'gene', 'query':{'exists':{'field':'gene.chr'}}}}))

    # # Only need to filter by chromosome, maf or mac
    if 'chr' in filters and len(filters['chr']) > 0 and len(filters['chr']) < 5:
        s = s.filter(Q('bool', should=[Q({'nested':{'path':'gene', 'query':{'match':{'gene.chr':chrom if len(chrom) > 3 else 'chr%s' % chrom}}}}) for chrom in
                                       filters['chr']]))
    if 'significant' in filters:
        s = s.filter(Q('range', mac={'gte': 6}))
        s = s.filter('term', overBonferroni='T') # TODO: change this to permutation once the new indexed scores are in.
    if search_after != '':
        search_after = parse_lastel(search_after)
        print(search_after)
        s = s.extra(search_after=search_after)
    s = s[0:size]
    result = s.execute()
    associations = result['hits']['hits']
    last_el = result['hits']['hits'][-1]['sort']
    # Transformation needed to safeguard URL transmission
    last_el[1] = "-".join(last_el[1].split('#'))
    return [association['_source'].to_dict() for association in associations], result['hits']['total'], last_el
Example #31
def get_outbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    wikipedia_article: Optional[str] = None,
    limit: int = 100,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:

    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", source_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", source_work_ident=work_ident)
    elif wikipedia_article:
        search = search.filter("term",
                               source_wikipedia_article=wikipedia_article)
    else:
        raise ValueError("require a lookup key")

    search = search.sort("ref_index")

    # re-sort by index
    hits = _execute_ref_query(search, limit=limit, offset=offset)
    hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
    return hits
Example #32
def get200query(indexbase, my200page, agency, domaintype, org, mimetype, query):
    index = indexbase + '-200scanner'
    s = Search(using=es, index=index)
    s = s.sort('domain')

    if query is None:
        # produce an empty query
        s = s.query(~Q('match_all'))
    else:
        if my200page == 'All Scans':
            s = s.query('simple_query_string', query=query)
        else:
            field = 'data.' + deperiodize(my200page)
            s = s.query('query_string', query=query, fields=[field])

        if agency != 'All Agencies' and agency is not None:
            agencyquery = '"' + agency + '"'
            s = s.query("query_string", query=agencyquery, fields=['agency'])
        if domaintype != 'All Branches' and domaintype is not None:
            domaintypequery = '"' + domaintype + '"'
            s = s.query("query_string", query=domaintypequery, fields=['domaintype'])
        if org != 'All Organizations' and org is not None:
            orgquery = '"' + org + '"'
            s = s.query("query_string", query=orgquery, fields=['organization'])

        # filter with data derived from the pagedata index (if needed)
        pagedatadomains = []
        if mimetype != 'all content_types':
            domains = domainsWith(my200page, 'content_type', mimetype, indexbase + '-pagedata')
            pagedatadomains.extend(domains)
        if len(pagedatadomains) > 0:
            s = s.filter("terms", domain=pagedatadomains)

    return s
Example #33
def get_trade_history(size=10,
                      from_date='2015-10-10',
                      to_date='now',
                      sort_by='-operation_id_num',
                      search_after=None,
                      base="1.3.0",
                      quote="1.3.121"):

    s = Search(using=es, index="bitshares-*")

    s = s.extra(size=size)
    if search_after and search_after != '':
        s = s.extra(search_after=search_after.split(','))

    q = Q()
    q = q & Q("match", operation_type=4)
    q = q & Q("match", operation_history__op_object__is_maker=True)

    q = q & Q("match",
              operation_history__op_object__fill_price__base__asset_id=base)
    q = q & Q("match",
              operation_history__op_object__fill_price__quote__asset_id=quote)

    range_query = Q("range",
                    block_data__block_time={
                        'gte': from_date,
                        'lte': to_date
                    })
    s.query = q & range_query

    s = s.sort(*sort_by.split(','))
    response = s.execute()
    verify_es_response(response)

    return [hit.to_dict() for hit in response]
Example #34
 def build_product_es(self, args, product_search_condition, sources):
     product_es = Search() \
         .query(product_search_condition) \
         .source(['sku'] if args.get('only_sku') else sources) \
         .extra(track_total_hits=True)
     product_es = product_es.sort(*self.sort_condition(args))
     return product_es
Example #35
def search_talks(page=None, sort=None):
    """ Get Talks from by Topic from ElasticSearch
    """
    client = Elasticsearch([{
        'host':
        settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port':
        settings.ELASTICSEARCH['default']['PORT'],
    }])

    s = Search(using=client, index="vtalks")

    # Pagination
    if page:
        start = 0
        end = 10
        if page > 1:
            start = settings.PAGE_SIZE * (page - 1)
            end = settings.PAGE_SIZE * page
        s = s[start:end]

    # Sorting
    s = s.sort({sort: {"order": "desc"}})

    # Fields selection
    s = s.source(['id'])

    response = s.execute()

    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]

    return results_total, results_ids
Example #36
 def sources():
     """Returns list of published light Source objects.
     
     @returns: list
     """
     KEY = 'encyc-front:sources'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='sources')[0:MAX_SIZE]
         s = s.sort('encyclopedia_id')
         s = s.fields([
             'encyclopedia_id',
             'published',
             'modified',
             'headword',
             'media_format',
             'img_path',
         ])
         response = s.execute()
         data = [
             Source(
                 encyclopedia_id = hitvalue(hit, 'encyclopedia_id'),
                 published = hitvalue(hit, 'published'),
                 modified = hitvalue(hit, 'modified'),
                 headword = hitvalue(hit, 'headword'),
                 media_format = hitvalue(hit, 'media_format'),
                 img_path = hitvalue(hit, 'img_path'),
                )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     return data
Example #37
 def get_all(sort_by=None, start=0, limit=10, cve_id=None) -> List[Plugin]:
     search = Search(using=current_app.elasticsearch,
                     index=PLUGINS_INDEX)[start:limit]
     if cve_id:
         search = search.query("term", cvelist__keyword=cve_id)
     if sort_by:
         if sort_by == "score":
             search = search.sort({f"cvss.{sort_by}": {"order": "desc"}})
         else:
             search = search.sort({f"{sort_by}": {"order": "desc"}})
     response = search.execute()
     response = [
         PluginsService._parse_single_result(result)
         for result in response.hits
     ]
     return response
Example #38
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}

    start_message = 'scenario.p2p_connect.starting.clients'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool',
                 should=[
                     F('term', at_message=start_message),
                     F('term', at_message=stop_message)
                 ])
    s = s.fields(['@message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc,  we want the latest events
    response = s.execute()

    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['@message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
Example #39
def fetch(session):
    s = Search(client)
    s = s.filter('bool',
                 should=[F('term', message='p2p.disconnected'),
                         F('term', message='p2p.connected')])
    s = s.filter('range', **{'@timestamp': dict(gte=session['start'], lte=session['stop'])})
    s = s.fields(['json_message.p2p.connected.remote_id', 'guid', 'message', '@timestamp'])
    s = s[0:100000]
    # s = s[0:10]
    s = s.sort('@timestamp')
    response = s.execute()
    return response
Example #40
File: app.py Project: urykhy/stuff
    def search(self, **params):
        limit_cat = params.get('cat', "").strip()
        limit_forum = params.get('forum', "").strip()
        limit_count = int(params.get('count', 100))
        limit_size_min = human2bytes(params.get('min', "0b"))
        limit_size_max = human2bytes(params.get('max', "0b"))
        limit_wild = int(params.get('wild', 0))
        arg = params.get('query', '').strip()
        if not arg:
            arg = "hobbit"

        s = Search(using=es, index=ela_index)
        if limit_size_min:
            s = s.filter("range", size = {'gte' : limit_size_min })
        if limit_size_max:
            s = s.filter("range", size = {'lte' : limit_size_max })

        arg = arg.split(' ')
        if limit_wild:
            q = Q("wildcard", name="*"+arg.pop(0)+"*")
            for a in arg:
                q = q & Q("wildcard", name="*"+a+"*")
        else:
            q = Q("match", name=arg.pop(0))
            for a in arg:
                q = q & Q("match", name=a)

        if len(limit_cat):
            for a in limit_cat.split(' '):
                q = q & Q("match", category=a)
        if len(limit_forum):
            for a in limit_forum.split(' '):
                q = q & Q("match", forum=a)

        s = s.query(q)
        #cherrypy.log("query is "+str(s.to_dict()))
        r = s.execute()
        size = r.hits.total
        #cherrypy.log("query have "+str(size)+" elements")
        if size > limit_count:
            size = limit_count
        s = s.sort('-size')
        s = s.extra(size=size)
        r = s.execute()

        data = []
        for b in r:
            a = [b.id, b.size, b.name, b.category, b.forum, b.date[0] if b.date else '', b.hash]
            data.append(a)

        return {'data': data}
Example #41
File: app.py Project: urykhy/stuff
    def search(self, **params):
        limit_author = params.get('author', "").strip()
        limit_title = params.get('title', "").strip()
        limit_count = int(params.get('count', 10))
        limit_wild = int(params.get('wild', 0))
        q = None

        if not limit_author and not limit_title:
            limit_title = "hobbit"

        s = Search(using=es, index=ela_index)
        arg = limit_title.split(' ')
        arg = [x for x in arg if x]
        if len(arg):
            if limit_wild:
                q = Q("wildcard", title="*"+arg.pop(0)+"*")
                for a in arg:
                    q = q & Q("wildcard", title="*"+a+"*")
            else:
                q = Q("match", title=arg.pop(0))
                for a in arg:
                    q = q & Q("match", title=a)

        arg = limit_author.split(' ')
        arg = [x for x in arg if x]
        if len(arg):
            for a in arg:
                if q:
                    q = q & Q("match", author=a)
                else:
                    q = Q("match", author=a)

        s = s.query(q)
        #cherrypy.log("query is "+str(s.to_dict()))
        r = s.execute()
        size = r.hits.total
        if size > limit_count:
            size = limit_count
        s = s.sort('-date')
        s = s.extra(size=size)
        r = s.execute()
        #cherrypy.log("result is "+str(r))

        data = []
        for b in r:
            a = [b.id, b.author, b.title, b.size, b.date]
            data.append(a)

        return {'data': data}
Example #42
 def mapcategories(num_columns=None):
     """Returns list of MapCategory objects.
     
     @returns: list
     """
     s = Search(doc_type='mapcategory')[0:MAX_SIZE]
     s = s.sort('id')
     s = s.fields([
         'id',
         'title',
     ])
     response = s.execute()
     return [
         MapCategory(
             id = hitvalue(hit, 'id'),
             title = hitvalue(hit, 'title'),
         )
         for hit in response
     ]
Example #43
 def convert_filters_to_query(self, filters):
     s = Search(using=self.client)
     spec = filters.get("dataset__spec", None)
     modified_from = filters.get('modified__gt', None)
     modified_until = filters.get('modified__lt', None)
     if spec and not self.spec:
         self.spec = spec
     if self.spec:
         s = s.query("match", **{'system.spec.raw': self.spec})
     if self.query:
         if 'query' in self.query:
             s = s.query(self.query.get('query'))
         if 'filter' in self.query:
             s = s.query(self.query.get('filter'))
     if modified_from:
         s = s.filter("range", **{"system.modified_at": {"gte": modified_from}})
     if modified_until:
         s = s.filter("range", **{"system.modified_at": {"lte": modified_until}})
     s = s.sort({"system.modified_at": {"order": "asc"}})
     return s[self.cursor: self.get_next_cursor()]
Example #44
def search(
        hosts, index, query_type='multi_match', query='', filters={},
        sort='m_pseudoid', start=0, pagesize=10
):
    """Constructs Search object
    
    Note: allows any combination of filters, even illogical ones
    
    @param hosts: list settings.DOCSTORE_HOSTS
    @param index: elasticsearch_dsl.Index
    @param query_type: str Name of query type.
    @param query: str Query string.
    @param filters: dict Filters and their arguments.
    @param sort: str Name of field on which to sort.
    @param start: int Start of result set.
    @param pagesize: int Number of records to return.
    @returns: elasticsearch_dsl.Search
    """
    ## remove empty filter args
    #filter_args = {key:val for key,val in filters.items() if val}
    #if not (query or filter_args):
    #    return None,[]
    s = Search(using=ES, index=index)
    s = s.doc_type(Record)
    if filters:
        for field,values in filters.items():
            if values:
                # multiple terms for a field are OR-ed
                s = s.filter('terms', **{field: values})
    if query:
        s = s.query(
            query_type, query=query, fields=definitions.FIELDS_MASTER
        )
    # aggregations
    if filters:
        for field in filters.keys():
            s.aggs.bucket(field, 'terms', field=field, size=1000)
    s = s.fields(definitions.FIELDS_MASTER)
    s = s.sort(sort)
    s = s[start:start+pagesize]
    return s
Example #45
class TopologyData(object):
    """A base class used by models that are really Elasticsearch entries, and
    not db tables."""

    _DOC_TYPE = ""
    _INDEX_PREFIX = ""

    def __init__(self):
        self.conn = es_conn()
        self.search = Search(self.conn)

        # Using the private setters over methods simplifies mocking for
        # unit tests.
        # pylint: disable=W0212
        self.search._doc_type = self._DOC_TYPE
        self.search._index = es_indices(self._INDEX_PREFIX, self.conn)

    @classmethod
    def _sort_arg(cls, key, order):
        """Return key as, key or -key, depending on the sort order."""

        if order in ["+", "asc"]:
            return key              # translates to [{key: {'order': 'asc'}}]
        elif order in ["-", "desc"]:
            return "-" + key        # translates to [{key: {'order': 'desc'}}]
        else:
            raise ValueError("Valid order values are in [+, -, asc, desc]")

    def get(self, count=1, sort_key="@timestamp", sort_order="desc"):
        """Return the latest n instances from ES or None if not found."""
        from elasticsearch import ElasticsearchException

        try:
            self.search = self.search.sort(self._sort_arg(sort_key, sort_order))  # reassign: sort() returns a new Search
            self.search = self.search[0:count]

            logger.debug("[get] search = %s", self.search.to_dict())
            # pylint: disable=W0212
            logger.debug("[get] index = %s", self.search._index)
            logger.debug("[get] doc_type = %s", self._DOC_TYPE)

            return self.search.execute()

        except ElasticsearchException as exc:
            logger.debug("get from ES failed, exception was %s", exc.message)
            raise

        except ValueError as exc:
            logger.exception(exc)
            raise

    def post(self, body, **_):
        """Post a record to the database.

        :arg body: record body as JSON object
        :arg _: Unused.
        :return: id of the inserted record

        """

        logger.debug("post called with body = %s", json.dumps(body))

        response = self.conn.create(
            daily_index(self._INDEX_PREFIX),
            self._DOC_TYPE,
            body,
            refresh=True)

        logger.debug('[post] response = %s', json.dumps(response))
        return response['_id']
Example #46
class EsSearch(object):

    def __init__(self, families, previous_search_results=None):
        self._client = get_es_client()

        self.samples_by_family_index = defaultdict(lambda: defaultdict(dict))
        for s in get_latest_loaded_samples(families):
            self.samples_by_family_index[s.elasticsearch_index][s.individual.family.guid][s.sample_id] = s

        if len(self.samples_by_family_index) < 1:
            raise InvalidIndexException('No es index found')

        self._set_index_metadata()

        if len(self.samples_by_family_index) != len(self.index_metadata):
            raise InvalidIndexException('Could not find expected indices: {}'.format(
                ', '.join(set(self.samples_by_family_index.keys()) - set(self.index_metadata.keys()))
            ))

        self.previous_search_results = previous_search_results or {}

        self._search = Search()
        self._index_searches = defaultdict(list)
        self._sort = None
        self._allowed_consequences = None

    def _set_index_metadata(self):
        self.index_metadata = get_index_metadata(','.join(self.samples_by_family_index.keys()), self._client)

    def filter(self, new_filter):
        self._search = self._search.filter(new_filter)
        return self

    def sort(self, sort):
        self._sort = _get_sort(sort)
        self._search = self._search.sort(*self._sort)

    def filter_by_annotations(self, annotations, pathogenicity_filter):
        consequences_filter, allowed_consequences = _annotations_filter(annotations)
        if allowed_consequences:
            if pathogenicity_filter:
                consequences_filter |= pathogenicity_filter
            self.filter(consequences_filter)
            self._allowed_consequences = allowed_consequences

    def filter_by_genotype(self, inheritance, quality_filter=None):
        has_previous_compound_hets = self.previous_search_results.get('grouped_results')

        inheritance_mode = (inheritance or {}).get('mode')
        inheritance_filter = (inheritance or {}).get('filter') or {}
        if inheritance_filter.get('genotype'):
            inheritance_mode = None

        quality_filter = dict({'min_ab': 0, 'min_gq': 0}, **(quality_filter or {}))
        if quality_filter['min_ab'] % 5 != 0:
            raise Exception('Invalid ab filter {}'.format(quality_filter['min_ab']))
        if quality_filter['min_gq'] % 5 != 0:
            raise Exception('Invalid gq filter {}'.format(quality_filter['min_gq']))

        if quality_filter and quality_filter.get('vcf_filter') is not None:
            self.filter(~Q('exists', field='filters'))

        for index, family_samples_by_id in self.samples_by_family_index.items():
            if not inheritance and not quality_filter['min_ab'] and not quality_filter['min_gq']:
                search_sample_count = sum(len(samples) for samples in family_samples_by_id.values())
                index_sample_count = Sample.objects.filter(elasticsearch_index=index).count()
                if search_sample_count == index_sample_count:
                    # If searching across all families in an index with no inheritance mode we do not need to explicitly
                    # filter on inheritance, as all variants have some inheritance for at least one family
                    continue

            genotypes_q = _genotype_inheritance_filter(
                inheritance_mode, inheritance_filter, family_samples_by_id, quality_filter,
            )

            compound_het_q = None
            if inheritance_mode == COMPOUND_HET:
                compound_het_q = genotypes_q
            else:
                self._index_searches[index].append(self._search.filter(genotypes_q))

            if inheritance_mode == RECESSIVE:
                compound_het_q = _genotype_inheritance_filter(
                    COMPOUND_HET, inheritance_filter, family_samples_by_id, quality_filter,
                )

            if compound_het_q and not has_previous_compound_hets:
                compound_het_search = self._search.filter(compound_het_q)
                compound_het_search.aggs.bucket(
                    'genes', 'terms', field='geneIds', min_doc_count=2, size=MAX_COMPOUND_HET_GENES+1
                ).metric(
                    'vars_by_gene', 'top_hits', size=100, sort=self._sort, _source=QUERY_FIELD_NAMES
                )
                self._index_searches[index].append(compound_het_search)

    def search(self, page=1, num_results=100):
        indices = self.samples_by_family_index.keys()

        logger.info('Searching in elasticsearch indices: {}'.format(', '.join(indices)))

        num_loaded = len(self.previous_search_results.get('all_results', []))
        if len(indices) == 1 \
                and len(self._index_searches.get(indices[0], [])) <= 1 \
                and not self.previous_search_results.get('grouped_results'):
            start_index = None
            if (page - 1) * num_results < num_loaded:
                start_index = num_loaded
            return self._execute_single_search(page, num_results, start_index=start_index)
        elif not self._index_searches:
            # If doing all project-families all inheritance search, do it as a single query
            # Load all variants, do not skip pages
            num_loaded += self.previous_search_results.get('duplicate_doc_count', 0)
            if num_loaded >= (page-1)*num_results:
                start_index = num_loaded
            else:
                start_index = 0
            return self._execute_single_search(
                page, num_results, start_index=start_index, deduplicate=True
            )
        else:
            return self._execute_multi_search(page, num_results)

    def _execute_single_search(self, page, num_results, deduplicate=False, start_index=None):
        index_name = ','.join(self.samples_by_family_index.keys())
        search = self._get_paginated_searches(
            index_name, page, num_results*len(self.samples_by_family_index), start_index=start_index
        )[0]

        response = self._execute_search(search)
        variant_results, total_results, is_compound_het = self._parse_response(response)
        self.previous_search_results['total_results'] = total_results

        results_start_index = (page - 1) * num_results
        if is_compound_het:
            variant_results = _sort_compound_hets(variant_results)
            self.previous_search_results['grouped_results'] = variant_results
            end_index = min(results_start_index + num_results, total_results)
            return _get_compound_het_page(variant_results, results_start_index, end_index)

        if deduplicate:
            variant_results = self._deduplicate_results(variant_results)

        # Only save contiguous pages of results:
        previous_all_results = self.previous_search_results.get('all_results', [])
        if len(previous_all_results) >= results_start_index:
            self.previous_search_results['all_results'] = self.previous_search_results.get('all_results', []) + variant_results
            variant_results = self.previous_search_results['all_results'][results_start_index:]

        return variant_results[:num_results]

    def _execute_multi_search(self, page, num_results):
        indices = self.samples_by_family_index.keys()

        if not self.previous_search_results.get('loaded_variant_counts'):
            self.previous_search_results['loaded_variant_counts'] = {}

        ms = MultiSearch()
        for index_name in indices:
            start_index = 0
            if self.previous_search_results['loaded_variant_counts'].get(index_name):
                index_total = self.previous_search_results['loaded_variant_counts'][index_name]['total']
                start_index = self.previous_search_results['loaded_variant_counts'][index_name]['loaded']
                if start_index >= index_total:
                    continue
            else:
                self.previous_search_results['loaded_variant_counts'][index_name] = {'loaded': 0, 'total': 0}

            searches = self._get_paginated_searches(index_name, page, num_results, start_index=start_index)
            ms = ms.index(index_name)
            for search in searches:
                ms = ms.add(search)

        responses = self._execute_search(ms)

        new_results = []
        compound_het_results = self.previous_search_results.get('compound_het_results', [])
        for response in responses:
            response_hits, response_total, is_compound_het = self._parse_response(response)
            if not response_total:
                continue

            index_name = response.hits[0].meta.index
            if is_compound_het:
                compound_het_results += response_hits
                self.previous_search_results['loaded_variant_counts']['{}_compound_het'.format(index_name)] = {'total': response_total}
            else:
                new_results += response_hits
                self.previous_search_results['loaded_variant_counts'][index_name]['total'] = response_total
                self.previous_search_results['loaded_variant_counts'][index_name]['loaded'] += len(response_hits)

        self.previous_search_results['total_results'] = sum(counts['total'] for counts in self.previous_search_results['loaded_variant_counts'].values())

        # combine new results with unsorted previously loaded results to correctly sort/paginate
        all_loaded_results = self.previous_search_results.get('all_results', [])
        previous_page_record_count = (page - 1) * num_results
        if len(all_loaded_results) >= previous_page_record_count:
            loaded_results = all_loaded_results[:previous_page_record_count]
            new_results += all_loaded_results[previous_page_record_count:]
        else:
            loaded_results = []
            new_results += self.previous_search_results.get('variant_results', [])

        new_results = sorted(new_results, key=lambda variant: variant['_sort'])
        variant_results = self._deduplicate_results(new_results)

        if compound_het_results or self.previous_search_results.get('grouped_results'):
            if compound_het_results:
                compound_het_results = self._deduplicate_compound_het_results(compound_het_results)
            return self._process_compound_hets(compound_het_results, variant_results, num_results)
        else:
            self.previous_search_results['all_results'] = loaded_results + variant_results
            return variant_results[:num_results]

    def _get_paginated_searches(self, index_name, page, num_results, start_index=None):
        searches = []
        for search in self._index_searches.get(index_name, [self._search]):
            search = search.index(index_name)

            if search.aggs.to_dict():
                # For compound het searches, get results from the aggregation instead of top-level hits
                search = search[:1]
                logger.info('Loading compound hets for {}'.format(index_name))
            else:
                end_index = page * num_results
                if start_index is None:
                    start_index = end_index - num_results

                search = search[start_index:end_index]
                search = search.source(QUERY_FIELD_NAMES)
                logger.info('Loading {} records {}-{}'.format(index_name, start_index, end_index))

            searches.append(search)
        return searches

    def _execute_search(self, search):
        logger.debug(json.dumps(search.to_dict(), indent=2))
        try:
            return search.using(self._client).execute()
        except elasticsearch.exceptions.ConnectionTimeout as e:
            canceled = self._delete_long_running_tasks()
            logger.error('ES Query Timeout. Canceled {} long running searches'.format(canceled))
            raise e

    def _parse_response(self, response):
        if hasattr(response.aggregations, 'genes') and response.hits:
            response_hits, response_total = self._parse_compound_het_response(response)
            return response_hits, response_total, True

        response_total = response.hits.total
        logger.info('Total hits: {} ({} seconds)'.format(response_total, response.took / 1000.0))

        return [self._parse_hit(hit) for hit in response], response_total, False

    def _parse_compound_het_response(self, response):
        if len(response.aggregations.genes.buckets) > MAX_COMPOUND_HET_GENES:
            raise Exception('This search returned too many compound heterozygous variants. Please add stricter filters')

        index_name = response.hits[0].meta.index

        family_unaffected_individual_guids = {
            family_guid: {sample.individual.guid for sample in samples_by_id.values() if
                          sample.individual.affected == UNAFFECTED}
            for family_guid, samples_by_id in self.samples_by_family_index[index_name].items()
        }

        variants_by_gene = {}
        for gene_agg in response.aggregations.genes.buckets:
            gene_variants = [self._parse_hit(hit) for hit in gene_agg['vars_by_gene']]
            gene_id = gene_agg['key']

            if gene_id in variants_by_gene:
                continue

            if self._allowed_consequences:
                # Variants are returned if any transcripts have the filtered consequence, but to be compound het
                # the filtered consequence needs to be present in at least one transcript in the gene of interest
                gene_variants = [variant for variant in gene_variants if any(
                    transcript['majorConsequence'] in self._allowed_consequences for transcript in
                    variant['transcripts'][gene_id]
                )]
            if len(gene_variants) < 2:
                continue

            # Do not include the same group multiple times when identical variants appear in multiple genes
            if any(variant['mainTranscript']['geneId'] != gene_id for variant in gene_variants):
                primary_genes = [variant['mainTranscript']['geneId'] for variant in gene_variants]
                if all(gene == primary_genes[0] for gene in primary_genes):
                    is_valid_gene = True
                    if self._allowed_consequences:
                        is_valid_gene = all(any(
                            transcript['majorConsequence'] in self._allowed_consequences for transcript in
                            variant['transcripts'][primary_genes[0]]
                        ) for variant in gene_variants)
                    if is_valid_gene:
                        gene_id = primary_genes[0]
                        if gene_id in variants_by_gene:
                            continue
                else:
                    variant_ids = [variant['variantId'] for variant in gene_variants]
                    for gene in set(primary_genes):
                        if variant_ids == [variant['variantId'] for variant in variants_by_gene.get(gene, [])]:
                            continue

            family_guids = set(gene_variants[0]['familyGuids'])
            for variant in gene_variants[1:]:
                family_guids = family_guids.intersection(set(variant['familyGuids']))

            invalid_family_guids = set()
            for family_guid in family_guids:
                for individual_guid in family_unaffected_individual_guids[family_guid]:
                    # To be compound het, all unaffected individuals must be hom ref for at least one of the variants
                    is_family_compound_het = any(
                        variant['genotypes'].get(individual_guid, {}).get('numAlt') != 1 for variant in
                        gene_variants)
                    if not is_family_compound_het:
                        invalid_family_guids.add(family_guid)
                        break

            family_guids -= invalid_family_guids
            if not family_guids:
                continue

            for variant in gene_variants:
                variant['familyGuids'] = list(family_guids)

            variants_by_gene[gene_id] = gene_variants

        total_compound_het_results = sum(len(variants) for variants in variants_by_gene.values())
        logger.info('Total compound het hits: {}'.format(total_compound_het_results))

        return [{k: v} for k, v in variants_by_gene.items()], total_compound_het_results

    def _parse_hit(self, raw_hit):
        hit = {k: raw_hit[k] for k in QUERY_FIELD_NAMES if k in raw_hit}
        index_name = raw_hit.meta.index
        index_family_samples = self.samples_by_family_index[index_name]

        if hasattr(raw_hit.meta, 'matched_queries'):
            family_guids = list(raw_hit.meta.matched_queries)
        else:
            # Searches for all inheritance modes across all families do not filter on inheritance, so there are no matched_queries
            alt_allele_samples = set()
            for alt_samples_field in HAS_ALT_FIELD_KEYS:
                alt_allele_samples.update(hit[alt_samples_field])
            family_guids = [family_guid for family_guid, samples_by_id in index_family_samples.items()
                            if any(sample_id in alt_allele_samples for sample_id in samples_by_id.keys())]

        genotypes = {}
        for family_guid in family_guids:
            samples_by_id = index_family_samples[family_guid]
            genotypes.update({
                samples_by_id[genotype_hit['sample_id']].individual.guid: _get_field_values(genotype_hit, GENOTYPE_FIELDS_CONFIG)
                for genotype_hit in hit[GENOTYPES_FIELD_KEY] if genotype_hit['sample_id'] in samples_by_id
            })

        genome_version = self.index_metadata[index_name].get('genomeVersion')
        lifted_over_genome_version = None
        lifted_over_chrom = None
        lifted_over_pos = None
        liftover_grch38_to_grch37 = _liftover_grch38_to_grch37()
        if liftover_grch38_to_grch37 and genome_version == GENOME_VERSION_GRCh38:
            grch37_coord = liftover_grch38_to_grch37.convert_coordinate(
                'chr{}'.format(hit['contig'].lstrip('chr')), int(hit['start'])
            )
            if grch37_coord and grch37_coord[0]:
                lifted_over_genome_version = GENOME_VERSION_GRCh37
                lifted_over_chrom = grch37_coord[0][0].lstrip('chr')
                lifted_over_pos = grch37_coord[0][1]

        populations = {
            population: _get_field_values(
                hit, POPULATION_RESPONSE_FIELD_CONFIGS, format_response_key=lambda key: key.lower(),
                lookup_field_prefix=population,
                existing_fields=self.index_metadata[index_name]['fields'],
                get_addl_fields=lambda field, field_config:
                [pop_config.get(field)] + ['{}_{}'.format(population, custom_field) for custom_field in
                                           field_config.get('fields', [])],
            )
            for population, pop_config in POPULATIONS.items()
        }

        sorted_transcripts = [
            {_to_camel_case(k): v for k, v in transcript.to_dict().items()}
            for transcript in hit[SORTED_TRANSCRIPTS_FIELD_KEY] or []
        ]
        transcripts = defaultdict(list)
        for transcript in sorted_transcripts:
            transcripts[transcript['geneId']].append(transcript)

        result = _get_field_values(hit, CORE_FIELDS_CONFIG, format_response_key=str)
        result.update({
            field_name: _get_field_values(hit, fields, lookup_field_prefix=field_name)
            for field_name, fields in NESTED_FIELDS.items()
        })
        if hasattr(raw_hit.meta, 'sort'):
            result['_sort'] = [_parse_es_sort(sort, self._sort[i]) for i, sort in enumerate(raw_hit.meta.sort)]

        result.update({
            'familyGuids': sorted(family_guids),
            'genotypes': genotypes,
            'genomeVersion': genome_version,
            'liftedOverGenomeVersion': lifted_over_genome_version,
            'liftedOverChrom': lifted_over_chrom,
            'liftedOverPos': lifted_over_pos,
            'mainTranscript': sorted_transcripts[0] if len(sorted_transcripts) else {},
            'populations': populations,
            'predictions': _get_field_values(
                hit, PREDICTION_FIELDS_CONFIG, format_response_key=lambda key: key.split('_')[1].lower()
            ),
            'transcripts': transcripts,
        })
        return result

    def _deduplicate_results(self, sorted_new_results):
        duplicates = self.previous_search_results.get('duplicate_doc_count', 0)
        variant_results = []
        for variant in sorted_new_results:
            if variant_results and variant_results[-1]['variantId'] == variant['variantId']:
                variant_results[-1]['genotypes'].update(variant['genotypes'])
                variant_results[-1]['familyGuids'] = sorted(set(variant_results[-1]['familyGuids'] + variant['familyGuids']))
                duplicates += 1
            else:
                variant_results.append(variant)

        self.previous_search_results['duplicate_doc_count'] = duplicates

        self.previous_search_results['total_results'] -= duplicates

        return variant_results

    def _deduplicate_compound_het_results(self, compound_het_results):
        duplicates = 0
        results = {}
        for variant_group in compound_het_results:
            gene = variant_group.keys()[0]
            variants = variant_group[gene]
            if gene in results:
                for variant in variants:
                    existing_index = next(
                        (i for i, existing in enumerate(results[gene]) if existing['variantId'] == variant['variantId']), None,
                    )
                    if existing_index is not None:
                        results[gene][existing_index]['genotypes'].update(variant['genotypes'])
                        results[gene][existing_index]['familyGuids'] = sorted(
                            results[gene][existing_index]['familyGuids'] + variant['familyGuids']
                        )
                        duplicates += 1
                    else:
                        results[gene].append(variant)
            else:
                results[gene] = variants

        self.previous_search_results['duplicate_doc_count'] = duplicates + self.previous_search_results.get('duplicate_doc_count', 0)

        self.previous_search_results['total_results'] -= duplicates

        return [{k: v} for k, v in results.items()]

    def _process_compound_hets(self, compound_het_results, variant_results, num_results):
        if not self.previous_search_results.get('grouped_results'):
            self.previous_search_results['grouped_results'] = []

        # Sort merged result sets
        grouped_variants = [{None: [var]} for var in variant_results]
        grouped_variants = compound_het_results + grouped_variants
        grouped_variants = _sort_compound_hets(grouped_variants)

        loaded_result_count = sum(len(vars.values()[0]) for vars in grouped_variants + self.previous_search_results['grouped_results'])

        # Get requested page of variants
        flattened_variant_results = []
        num_compound_hets = 0
        num_single_variants = 0
        for variants_group in grouped_variants:
            variants = variants_group.values()[0]
            flattened_variant_results += variants
            if loaded_result_count != self.previous_search_results['total_results']:
                self.previous_search_results['grouped_results'].append(variants_group)
            if len(variants) > 1:
                num_compound_hets += 1
            else:
                num_single_variants += 1
            if len(flattened_variant_results) >= num_results:
                break

        # Only save non-returned results separately if not all results have been loaded
        if loaded_result_count == self.previous_search_results['total_results']:
            self.previous_search_results['grouped_results'] += grouped_variants
            self.previous_search_results['compound_het_results'] = []
            self.previous_search_results['variant_results'] = []
        else:
            self.previous_search_results['compound_het_results'] = compound_het_results[num_compound_hets:]
            self.previous_search_results['variant_results'] = variant_results[num_single_variants:]

        return flattened_variant_results

    def _delete_long_running_tasks(self):
        search_tasks = self._client.tasks.list(actions='*search', group_by='parents')
        canceled = 0
        for parent_id, task in search_tasks['tasks'].items():
            if task['running_time_in_nanos'] > 10 ** 11:
                canceled += 1
                self._client.tasks.cancel(parent_task_id=parent_id)
        return canceled
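The multi-index pagination above hinges on elasticsearch_dsl's MultiSearch, which batches one Search per index into a single round trip. Below is a minimal sketch of that pattern, assuming a local client; the index names and source fields are illustrative placeholders, not values from the class above.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import MultiSearch, Search

es_client = Elasticsearch()

ms = MultiSearch(using=es_client)
for index_name in ['family_index_a', 'family_index_b']:
    # Each per-index search carries its own pagination window and source filter.
    ms = ms.add(Search(index=index_name).source(['variantId', 'xpos'])[0:100])

# One round trip; responses come back in the same order the searches were added.
for response in ms.execute():
    print(response.hits.total, len(response.hits))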
Example #47
0
    def _build_query(self):
        query = Q()

        source = ['id']
        sort = []

        aggregations = {}
        query_string = None
        as_list = as_dict = False

        for action, value in self.steps:
            if action == 'order_by':
                for key in value:
                    if key.startswith('-'):
                        sort.append({key[1:]: 'desc'})
                    else:
                        sort.append(key)
            elif action == 'values':
                source.extend(value)
                as_list, as_dict = True, False
            elif action == 'values_dict':
                if value:
                    source.extend(value)
                as_list, as_dict = False, True
            elif action == 'query':
                query &= self._process_queries(value)
            elif action == 'filter':
                query &= self._process_filters(value)
            elif action == 'source':
                source.extend(value)
            elif action == 'aggregate':
                aggregations.update(value)
            elif action == 'filter_query_string':
                query_string = value
            else:
                raise NotImplementedError(action)

        # If we have a raw query string we are going to apply all sorts
        # of boosts and filters to improve relevance scoring.
        #
        # We are using the same rules that `search.filters:SearchQueryFilter`
        # implements to have a single-source of truth for how our
        # scoring works.
        from olympia.search.filters import SearchQueryFilter

        search = Search().query(query)

        if query_string:
            search = SearchQueryFilter().apply_search_query(
                query_string, search)

        if sort:
            search = search.sort(*sort)

        if source:
            search = search.source(source)

        body = search.to_dict()

        # These are manually added for now to simplify a partial port to
        # elasticsearch-dsl
        if self.start:
            body['from'] = self.start
        if self.stop is not None:
            body['size'] = self.stop - self.start
        if aggregations:
            body['aggs'] = aggregations

        self.source, self.as_list, self.as_dict = source, as_list, as_dict
        return body
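Since the method above serializes the Search and then patches paging and aggregations into the raw body, here is a minimal sketch of that hand-off; the client, index name, and field names are placeholders rather than anything from the original module.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search

es = Elasticsearch()

search = Search().query(Q('match', name='dark mode')).sort('-weekly_downloads')
body = search.to_dict()

# Paging and aggregations are grafted onto the serialized body by hand,
# mirroring the partial port to elasticsearch-dsl.
body['from'] = 20
body['size'] = 10
body['aggs'] = {'by_type': {'terms': {'field': 'type'}}}

results = es.search(index='addons', body=body)
print(results['hits']['total'])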
Example #48
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (
                            isinstance(val, basestring) and ' ' not in val
                        ):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better to know about it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
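The example above combines per-field filters with ~, | and & before attaching them to the search. The F filter class it uses comes from older elasticsearch_dsl releases (it was folded into Q around 2.0); a minimal sketch of the same combination logic with Q follows, with illustrative field names and values.

from elasticsearch_dsl import Q, Search

product_filter = Q('terms', product=['firefox', 'fennec'])   # multiple enum values behave as OR
date_filter = Q('range', date={'gte': '2023-01-01'}) & Q('range', date={'lt': '2023-02-01'})
is_null = ~Q('exists', field='version')                      # stands in for the removed 'missing' filter

search = Search(index='crash-reports')
search = search.filter(product_filter & date_filter & is_null)
print(search.to_dict())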
Example #49
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=(
                                    '_results_number cannot be greater '
                                    'than 1,000'
                                )
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative'
                            )
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because a query that covers a lot of different
                        # things can get very large and hog resources
                        # excessively. Downloading, for example, 100k facets
                        # (and 0 hits) when there is plenty of data yields
                        # an 11MB JSON file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000'
                            )

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names rather than, for example, aliases.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(
                params,
                search,
                facets_size,
                histogram_intervals
            )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better to know about it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error
                    )[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass
                raise
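A compact sketch of the operator translation used above: comparison operators map onto range clauses, the ~ / ^ / $ operators become wildcard queries (against the non-analyzed ".full" sub-field when one exists), and anything else falls back to a term clause. The helper name and example values are illustrative only.

from elasticsearch_dsl import Q

OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}
OPERATOR_WILDCARDS = {'~': '*%s*', '^': '%s*', '$': '*%s'}

def build_clause(field, operator, value, has_full_version=False):
    if operator in OPERATOR_RANGE:
        return Q('range', **{field: {OPERATOR_RANGE[operator]: value}})
    if operator in OPERATOR_WILDCARDS:
        if has_full_version:
            field = '%s.full' % field
        return Q('wildcard', **{field: OPERATOR_WILDCARDS[operator] % value})
    return Q('term', **{field: value})

print(build_clause('version', '>=', '50.0').to_dict())
print(build_clause('signature', '~', 'OOM', has_full_version=True).to_dict())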
Example #50
0
def calc(result):
   #xx = datetime.datetime.utcnow()
   #print 'x: ', xx
   #result['level1']['start'] = datetime.datetime.now().strftime("%B %d %Y, %X")
   #result['level1']['start'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")

   pue = dict()
   # Constants

   #pue['N1'] = 1000 / 1000
   #pue['N2'] = 1000 / 1000
   #pue['N3'] = 710 / 1000
   #pue['N4'] = 1700 / 1000
   ##pue['N6'] = 0
   #pue['N8'] = 500 / 1000
   #pue['N9'] = 1600 / 1000

   pue['N1'] = 1000
   pue['N2'] = 1000
   pue['N3'] = 710
   pue['N4'] = 1700
   ##pue['N6'] = 0
   pue['N8'] = 500
   pue['N9'] = 1600

   result['level1']['start'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
   result['level2']['start'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
   for x in i:
      indx = x + '*'
      ##print 'index', indx 
      for eskey in i[x]:
         ##timespan = i[x][eskey]
         ##print 'key', eskey 
         ##print 'value', i[x][eskey]
         (valueField, scale, variable, source) = i[x][eskey].split('|')
         if variable not in pue:
            pue[variable] = 0
            #print("clears pue")
         k = eskey.split('|')
         s = Search(using=esdb, index=indx)
         for j in k:
            (subkey, subvalue) = j.split(':')
            s = s.query("term", **{subkey: subvalue})
            ##print 'subkey', subkey
            ##print 'subvalue', subvalue
         ##s = s.query('range', **{'@timestamp':{'gte': '2018-07-01T00:00:00.000Z', 'lt':'2018-08-01T00:00:00.000Z'}})
         s = s.query('range', **{'@timestamp':{'gte': 'now-30m', 'lt':'now'}})
         s = s.sort('-@timestamp')
         #s = s.aggs.metric('power_sum', 'sum', field=valueField)
         s = s[0:1]

         #print s.to_dict() 
         response = s.execute() 

         #print 'Total %d hits found.' % response.hits.total 
         if response.hits.total != 0:
            for commit in response:
         #      print commit.to_dict()
               pue[variable] += commit['data']['datum'] * float(scale)
         #      ##print commit.to_dict()
         #      for n in k:
         #         (sk, sv) = n.split(':')
         #         if sk.find('.') != -1:
         #            (psk, ssk) = sk.split('.')
         #            ##print 'key: ', psk 
         #            ##print 'ha', commit[psk][ssk] 
         #         ##else:
         #            ##print 'key: ', sk 
         #            ##print 'value: ', sv 
         #            ##print 'ha', commit[sk] 
         #      v = response.aggregations.power_sum
         #      pue[variable] += ( v['value'] / response.hits.total )
         #      print("Processing %s" % variable)
         else:
            ##print s.to_dict() 
            if 'missing' not in result['level1']:
               result['level1']['missing'] = [variable]
               result['level2']['missing'] = [variable]
            else:
               result['level1']['missing'].append(variable)
               result['level2']['missing'].append(variable)

            if 'missing-meters' not in result['level2']:
               result['level1']['missing-meters'] = [source]
               result['level2']['missing-meters'] = [source]
            else:
               result['level1']['missing-meters'].append(source)
               result['level2']['missing-meters'].append(source)
            #print 'No Value for: ', variable, ' ', source

   pue['N7'] = pue['N7p'] - pue['N7pp']
   pue['N10pp'] = pue['N10p'] - pue['N10']
   pue['D'] = pue['D1'] + pue['D2']
   pue['E'] = pue['E1'] + pue['E2']
   pue['F'] = pue['F1'] + pue['F2']
   if (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] + pue['D2'] + pue['E1'] + pue['E2'] + pue['F1'] + pue['F2']) == 0:
      lineLoss = 0
   else:
      lineLoss = (pue['A1'] + pue['A2']) / (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] + pue['D2'] + pue['E1'] + pue['E2'] + pue['F1'] + pue['F2'])
   if pue['Bp'] == 0:
      txLoss590 = 0
   else:
      txLoss590 = (pue['B1'] + pue['B2']) / pue['Bp']
   if pue['Cp'] == 0:
      txLoss596 = 0
   else:
      txLoss596 = (pue['C1'] + pue['C2']) / pue['Cp']

   #numm1 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] + (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 ) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss
   #demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp']

   numm1 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] + pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss
   demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp']

   numm2 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N7'] + pue['N6'] + pue['N8'] + pue['N9'] + pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] + pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18'] ) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss

   demon2 = pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] + pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18'] + pue['Dp'] + pue['Ep'] + pue['Fp']

   if demon1 == 0:
      p1 = 0
   else:
      p1 = numm1 / demon1
   if demon2 == 0:
      p2 = 0
   else:
      p2 = numm2 / demon2

   result['level1']['pue'] = p1
   result['level2']['pue'] = p2
   result['level1']['end'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
   result['level2']['end'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
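The per-meter reads in calc() all follow the same shape: one term filter per key:value pair, a relative time window, newest-first ordering, and a single hit. A minimal standalone sketch of that query pattern follows; the index pattern and field names are placeholders, and the client setup is assumed as in the original.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

esdb = Elasticsearch()

s = Search(using=esdb, index='power-meters-*')
s = s.query('term', **{'meta.sensor': 'pdu-590-a'})                      # one term filter per key:value pair
s = s.query('range', **{'@timestamp': {'gte': 'now-30m', 'lt': 'now'}})  # restrict to the last 30 minutes
s = s.sort('-@timestamp')                                                # newest first
s = s[0:1]                                                               # keep only the most recent document

response = s.execute()
for hit in response:
    print(hit['data']['datum'])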
Example #51
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never
                    # actually get removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
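Aside (not part of the project code above): the example maps comparison operators to range filters and the '~', '$', '^' operators to wildcard patterns before turning them into filters. Below is a minimal, self-contained sketch of that technique using elasticsearch_dsl's Q object; the field name, operator symbols, and helper name are illustrative assumptions, not the project's API.

from elasticsearch_dsl import Q

# Wildcard patterns per operator, mirroring the mapping in the example above.
OPERATOR_WILDCARDS = {
    '~': '*%s*',  # contains
    '$': '%s*',   # starts with
    '^': '*%s',   # ends with
}
# Range comparisons per operator.
OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}

def build_query(field, operator, value):
    """Return an elasticsearch_dsl Q for a single field/operator/value."""
    if operator in OPERATOR_WILDCARDS:
        return Q('wildcard', **{field: OPERATOR_WILDCARDS[operator] % value})
    if operator in OPERATOR_RANGE:
        return Q('range', **{field: {OPERATOR_RANGE[operator]: value}})
    return Q('term', **{field: value})

# e.g. build_query('product', '~', 'fire') builds a wildcard query on "*fire*"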
Example #52
0
File: utils.py Project: olabi/lore
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}

    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi = query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"])
        search = search.query(multi)

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=key)
            )
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)
    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            reverse = sort_by.startswith("-")
            if reverse:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if reverse:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    vocab_ids = set(get_vocab_ids(repo_slug=repo_slug))
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing", field=vocab_key
        )
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms", field=vocab_key
        )
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=key), "terms", field=key
        )

    return SearchResults(search)
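For illustration, a hypothetical call to search_index follows; the token string, repository slug, sort field, and vocabulary name are invented, not taken from the project. A numeric sort such as "-nr_views" would be rewritten to "-xa_nr_views" by the workaround shown above, with "id" appended as a tie-breaker.

# Hypothetical usage; argument values below are invented for illustration.
results = search_index(
    tokens="photosynthesis",
    repo_slug="demo-repo",
    sort_by="-nr_views",  # becomes "-xa_nr_views", then sorted with "id"
    terms={"difficulty": ["easy"]},
)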