def search(query, order=None, maxPrice=None, minPrice=None, sortBy=None, page=None):
    s = Search(using=es, index="shopee")
    q = Q("multi_match", query=query, fields=['item_name', 'item_name.folded'])
    s = s.query(q)
    if sortBy == 'ctime':
        s = s.sort({"item_ctime": {"order": "desc"}})
    elif sortBy == 'sales':
        s = s.sort({"item_sold": {"order": "desc"}})
    elif sortBy == 'price' and order != "desc":
        s = s.sort({"item_price": {"order": "asc"}})
    elif sortBy == 'price' and order == "desc":
        s = s.sort({"item_price": {"order": "desc"}})
    item_price = {}
    if minPrice:
        minPrice = int(minPrice) * 100000
        item_price['gte'] = str(minPrice)
    if maxPrice:
        maxPrice = int(maxPrice) * 100000
        item_price['lte'] = maxPrice
    if item_price:
        print(item_price)
        s = s.filter('range', item_price=item_price)
    page = int(page) if page else 0
    s = s[20 * page:20 * (page + 1)]
    response = s.execute()
    results = response['hits']['hits']
    return results
def fetch_events(self, query):
    search = Search(**self.get_search_kwargs())
    search = search.query(query)
    # sort() returns a new Search object, so the result must be kept
    search = search.sort('-@timestamp')
    response = search.execute()
    results = get_response_dict(response)
    return results[0]
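# A minimal sketch (assumed index pattern and field name) of why the reassignment above
# matters: elasticsearch_dsl Search objects are immutable, so sort()/filter()/source()
# return a modified copy and the original is unchanged unless the result is captured.
from elasticsearch_dsl import Search

s = Search(index="events-*")   # hypothetical index pattern
s.sort('-@timestamp')          # no effect: the sorted copy is discarded
s = s.sort('-@timestamp')      # correct: keep the sorted copy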
def es_get_accounts(main_chain='eos', user_id=None, start_vol=None, end_vol=None,
                    from_idx=0, size=100, order='totalEos', fields=None):
    index = '{}_account'.format(main_chain)
    if not fields:
        fields = [
            'id', 'timestamp', 'updateTimestamp', 'userId',
            'totalEos', 'liquidEos', 'stackedEos', 'unstackingEos'
        ]
    if user_id:
        s = Search(using=es_client, index=index, doc_type='doc') \
            .filter('term', userId=user_id)
    elif start_vol and end_vol:
        range = {order: {'gte': start_vol, 'lt': end_vol}}
        s = Search(using=es_client, index=index, doc_type='doc') \
            .source(include=fields) \
            .filter('range', **range)
        s = s.sort({order: {"order": "desc"}})
    else:
        s = Search(using=es_client, index=index, doc_type='doc').source(include=fields)
        s = s.sort({order: {"order": "desc"}})
    resp = s[from_idx:from_idx + size].execute()
    return es_resp_to_payload(resp)
def getUSWDSquery(indexbase, query, version, agency, domaintype, sort):
    index = indexbase + '-uswds2'
    try:
        query = int(query)
    except:
        query = 0
    s = Search(using=es, index=index)
    if sort == 'Score':
        s = s.sort('-data.total_score')
    else:
        s = s.sort('domain')
    s = s.query(Bool(should=[Range(data__total_score={'gte': query})]))
    if version != 'all versions':
        if version == 'detected versions':
            s = s.query("query_string", query='v*', fields=['data.uswdsversion'])
        else:
            versionquery = '"' + version + '"'
            s = s.query("query_string", query=versionquery, fields=['data.uswdsversion'])
    if agency != 'All Agencies':
        agencyquery = '"' + agency + '"'
        s = s.query("query_string", query=agencyquery, fields=['agency'])
    if domaintype != 'All Branches':
        domaintypequery = '"' + domaintype + '"'
        s = s.query("query_string", query=domaintypequery, fields=['domaintype'])
    return s
def get_all_ids(index=None, id_field='recid', last_updated=None, latest_first=False):
    """Get all record or inspire ids of publications in the search index

    :param index: name of index to use.
    :param id_field: elasticsearch field to return. Should be 'recid' or 'inspire_id'
    :return: list of integer ids
    """
    if id_field not in ('recid', 'inspire_id'):
        raise ValueError('Invalid ID field %s' % id_field)
    search = Search(using=es, index=index) \
        .filter("term", doc_type=CFG_PUB_TYPE) \
        .source(fields=[id_field])
    if last_updated:
        search = search.filter("range", **{'last_updated': {'gte': last_updated.isoformat()}})
    if latest_first:
        search = search.sort({'last_updated': {'order': 'desc'}})
    else:
        search = search.sort('recid')
    search = search.params(preserve_order=True)
    return [int(h[id_field]) for h in search.scan()]
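# Hypothetical call to get_all_ids() above; the index name and cut-off date are examples only.
from datetime import datetime

recent_recids = get_all_ids(index='hepdata-main',          # assumed index name
                            id_field='recid',
                            last_updated=datetime(2021, 1, 1),
                            latest_first=True)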
def post(self):
    json = request.get_json()
    s = Search(using=es, index='twitter', doc_type='items')
    username = json.pop('username') if 'username' in json else None
    following = True
    if 'following' in json:
        following = json.pop('following')
    else:
        if username:
            following = True
    timestamp = json.pop('timestamp') if 'timestamp' in json else time()
    search = 'q' in json
    limit = json.pop('limit') if 'limit' in json and json['limit'] <= 100 else 50
    # do we only need this if following=true?
    following_list = db.user.find_one({'username': session['username']})['following']
    s = s.filter('range', timestamp={'lte': timestamp})
    if search:
        s = s.query('match', content=json['q'])
    if username:
        if following:
            # only match the username if the requester follows them
            s = s.filter('term', username=username if username in following_list else '')
        else:
            s = s.filter('term', username=username)
    else:
        if following:
            s = s.filter('terms', username=following_list)
    if 'parent' in json:
        s = s.filter('term', parent=json['parent'])
    if 'replies' not in json:
        json['replies'] = True
    if not json['replies']:
        # exclude replies by requiring a null parent
        s = s.filter('term', parent=None)
    if 'rank' not in json:
        json['rank'] = 'interest'
    s = s[0:limit]
    if json['rank'] == 'time':
        s = s.sort('-timestamp')
    else:
        s = s.sort('-interest_score')
    results = s.execute()
    l = [x['_source'].to_dict() for x in results['hits']['hits']]
    return Response(response=dumps({'status': 'OK', 'items': l}), mimetype='application/json')
def get_all_articles(self):
    '''
    Get all articles from the index.
    '''
    s = Search(index="article")
    s = s.query("match_all")
    # sort in descending order; sort() returns a new Search, so reassign it
    s = s.sort({'created': {'order': 'desc'}})
    response = self.get_response(s)
    return response
def _search(self, query):
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    if self.DTRange != None:
        s = s.filter('range', **self.DTRange)
    # source() and sort() return new Search objects, so capture the results
    s = s.source(includes=['winlog.*'])
    s = s.sort('-winlog.event_data.UtcTime')
    if self.Scan:
        return s.scan()
    else:
        return s.execute().hits
def _apply_paging(self, catalog: CatalogName, es_search: Search, pagination: Mapping[str, Any]):
    """
    Applies the pagination to the ES Search object

    :param catalog: The name of the catalog to query
    :param es_search: The ES Search object
    :param pagination: Dictionary with raw entries from the GET Request. It has:
        'size', 'sort', 'order', and one of 'search_after', 'search_before', or 'from'.
    :return: An ES Search object where pagination has been applied
    """
    # Extract the fields for readability (and slight manipulation)
    _sort = pagination['sort'] + '.keyword'
    _order = pagination['order']
    field_type = self.field_type(catalog, tuple(pagination['sort'].split('.')))
    _mode = field_type.es_sort_mode

    def sort_values(sort_field, sort_order, sort_mode):
        assert sort_order in ('asc', 'desc'), sort_order
        return (
            {
                sort_field: {
                    'order': sort_order,
                    'mode': sort_mode,
                    'missing': '_last' if sort_order == 'asc' else '_first'
                }
            },
            {
                '_uid': {
                    'order': sort_order
                }
            }
        )

    # Using search_after/search_before pagination
    if 'search_after' in pagination:
        es_search = es_search.extra(search_after=pagination['search_after'])
        es_search = es_search.sort(*sort_values(_sort, _order, _mode))
    elif 'search_before' in pagination:
        es_search = es_search.extra(search_after=pagination['search_before'])
        rev_order = 'asc' if _order == 'desc' else 'desc'
        es_search = es_search.sort(*sort_values(_sort, rev_order, _mode))
    else:
        es_search = es_search.sort(*sort_values(_sort, _order, _mode))

    # fetch one more than needed to see if there's a "next page".
    es_search = es_search.extra(size=pagination['size'] + 1)
    return es_search
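# A hypothetical pagination mapping for _apply_paging() above, mirroring the keys named in
# its docstring; the sort field, the search_after values, and the `service`/`catalog`
# variables are illustrative assumptions.
pagination = {
    'size': 10,
    'sort': 'project_title',                  # assumed sortable field
    'order': 'asc',
    'search_after': ['Some Title', 'doc#42'], # sort values returned with the previous page
}
es_search = service._apply_paging(catalog, es_search, pagination)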
def get_summary_statistics():
    """
    Obtains statistics about current sum of flows, packets, bytes.

    :return: JSON with status "ok" or "error" and requested data.
    """
    try:
        # Elastic query
        client = elasticsearch.Elasticsearch([{
            'host': myconf.get('consumer.hostname'),
            'port': myconf.get('consumer.port')
        }])
        elastic_bool = []
        elastic_bool.append({'range': {'@timestamp': {'gte': "now-5m", 'lte': "now"}}})
        elastic_bool.append({'term': {'@type': 'protocols_statistics'}})
        qx = Q({'bool': {'must': elastic_bool}})
        s = Search(using=client, index='_all').query(qx)
        s.aggs.bucket('sum_of_flows', 'sum', field='flows')
        s.aggs.bucket('sum_of_packets', 'sum', field='packets')
        s.aggs.bucket('sum_of_bytes', 'sum', field='bytes')
        # sort() returns a new Search object, so keep the result
        s = s.sort('@timestamp')
        result = s.execute()

        # Result parsing into CSV in format: timestamp, flows, packets, bytes
        data = "Timestamp, Flows, Packets, Bytes;"
        timestamp = "Last 5 Minutes"
        data += timestamp + ', ' + \
            str(int(result.aggregations.sum_of_flows['value'])) + ', ' + \
            str(int(result.aggregations.sum_of_packets['value'])) + ', ' + \
            str(int(result.aggregations.sum_of_bytes['value']))

        json_response = '{"status": "Ok", "data": "' + data + '"}'
        return json_response
    except Exception as e:
        json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
        return json_response
def search_more_like_this(talk):
    """ Get more like this documents
    """
    client = Elasticsearch([{
        'host': settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port': settings.ELASTICSEARCH['default']['PORT'],
    }])
    s = Search(using=client, index="vtalks")
    s = s.query(MoreLikeThis(
        like={"_index": "vtalks", "_type": "talk", "_id": talk.id},
        fields=['title', 'description', 'tags']))
    # Sorting
    s = s.sort({"_score": {"order": "desc"}})
    # Fields selection
    s = s.source(['id'])
    response = s.execute()
    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]
    return results_total, results_ids
def search(self, **params):
    index = params.get('index', self.index)
    search = Search(using=self.client, index=index)

    page = params.get('page', None)
    per_page = params.get('per_page', None)
    if page and per_page:
        page = page - 1
        search._extra = {'from': page, 'size': per_page}

    sort = params.get('sort', None)
    if sort and sort.replace('-', '') in ['created_at', 'level']:
        search = search.sort(sort)

    date_filter = self._filter_by_date_interval(params)
    if date_filter:
        search = search.filter(date_filter)

    level = params.get('group_by', None)
    if level:
        search = search.query('match', level=level)

    hits = search.execute()

    format = params.get('format', 'object')
    if format == 'dict':
        return self._to_dict(hits)
    else:
        return self._to_logs(hits)
def _create_search(dbm, form_model, local_time_delta, pagination_params, sort_params, search_parameters):
    es = Elasticsearch(hosts=[{"host": ELASTIC_SEARCH_HOST, "port": ELASTIC_SEARCH_PORT}])
    search = Search(using=es, index=dbm.database_name, doc_type=form_model.id)
    search = search.sort(sort_params)
    search = search.extra(**pagination_params)
    search = search.query('match', status='Success')
    search = search.query('term', void=False)
    if search_parameters.get('data_sender_filter'):
        search = search.query(
            "term", **{"datasender.id": search_parameters.get('data_sender_filter')})
    if search_parameters.get('unique_id_filters'):
        search = _add_unique_id_filters(
            form_model, search_parameters.get('unique_id_filters'), search)
    if search_parameters.get('date_question_filters'):
        for key, values in search_parameters.get('date_question_filters').iteritems():
            query = DateQuestionRangeFilter(values['dateRange'], form_model, key).build_filter_query()
            if query is not None:
                search = search.query(query)
    if search_parameters.get('search_text'):
        query_text_escaped = ElasticUtilsHelper().replace_special_chars(
            search_parameters.get('search_text'))
        search = search.query("query_string", query=query_text_escaped)
    submission_date_query = SubmissionDateRangeFilter(
        search_parameters.get('submission_date_range'), local_time_delta).build_filter_query()
    if submission_date_query:
        search = search.query(submission_date_query)
    return search
def make_query(query, filters, page, sort_by):
    try:
        client = Elasticsearch()
        s = Search(client, index=app.config['INDEX'])
        if query:
            s = s.query(QueryString(query=escape_query(query)))
            if not sort_by:
                sort_by = "relevance"
        else:
            s = s.query(MatchAll())
            if not sort_by:
                sort_by = DEFAULT_SORT_BY
        s = s.sort(SORT_BY.get(sort_by, DEFAULT_SORT_BY)['value'])
        start = (page - 1) * 20
        end = start + 20
        s = s[start:end]
        if filters:
            s = s.filter('bool', must=filters)
        result = s.execute()
        return result
    except ConnectionError as ex:
        return None
def consensus(offset=60):
    """
    check for 'eth.chain.new_head' messages and return the max number of clients,
    that had the same head during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q('match', message='eth.chain.new_head'))
    s = s.filter('exists', field='json_message.eth.chain.new_head.block_number')
    s = s.sort({'json_message.eth.chain.new_head.ts': {'order': 'desc', 'ignore_unmapped': 'true'}})
    response = s.execute()

    # Get latest block number
    x = max(hit['_source']['json_message']['eth.chain.new_head']['block_number']
            for hit in response.hits.hits)

    # By default, the buckets are ordered by their doc_count descending
    # s.aggs.bucket('by_block_hash', 'terms', field='json_message.eth.chain.new_head.block_hash', size=3)

    # Reach consensus around latest block number
    s = Search(client)
    s = s.filter(time_range_filter(field="json_message.eth.chain.new_head.ts", offset=offset))
    s.aggs.bucket('latest', 'range',
                  field='json_message.eth.chain.new_head.block_number',
                  ranges=[{"from": x - 1, "to": x + 1}]).bucket(
        'by_block_hash', 'terms',
        field='json_message.eth.chain.new_head.block_hash', size=3)
    # s = s[10:10]
    response = s.execute()
    # pprint(response)
    if response:
        return max(tag.doc_count
                   for tag in response.aggregations.latest.buckets[0].by_block_hash.buckets)
    else:
        return 0
def main():
    """Query ES to get first and last commit of each author together with
    some extra info like .
    """
    es_conn = create_conn()

    # Create search object
    s = Search(using=es_conn, index='git')

    # FILTER: retrieve commits before given year
    s = s.filter('range', grimoire_creation_date={'lt': 'now/y'})

    # Bucketize by uuid and get first and last commit (commit date is stored in
    # author_date field)
    s.aggs.bucket('authors', 'terms', field='author_uuid', size=10000000) \
        .metric('first', 'top_hits',
                _source=['author_date', 'author_org_name', 'author_uuid', 'project'],
                size=1, sort=[{"author_date": {"order": "asc"}}]) \
        .metric('last_commit', 'max', field='author_date')

    # Sort by commit date
    s = s.sort("author_date")

    #print(s.to_dict())
    result = s.execute()

    # Print result
    print(json.dumps(result.to_dict()['aggregations'], indent=2, sort_keys=True))
def build_file_es(self, args, search_condition):
    file_es = Search() \
        .query(search_condition)
    file_es = file_es.sort(*self.sort_condition(args))
    file_es = self.add_custom_source(file_es, args)
    file_es = self.add_page_limit_to_file_es(args, file_es)
    return file_es
def fetch_incidents():
    last_fetch, last_fetch_timestamp = get_last_fetch_time()
    es = elasticsearch_builder()

    query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elastic search can use epoch timestamps (in milliseconds) as date representation
    # regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {TIME_FIELD: {'gt': last_fetch_timestamp}}})
    search = search.sort({TIME_FIELD: {'order': 'asc'}})[0:FETCH_SIZE].query(query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List
    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            incidents, last_fetch = results_to_incidents_timestamp(response, last_fetch)
            demisto.setLastRun({'time': last_fetch})
        else:
            incidents, last_fetch = results_to_incidents_datetime(response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})
        demisto.info('extract {} incidents'.format(len(incidents)))
    demisto.incidents(incidents)
def search(self, query):
    skip = int(query.pop("skip", 0))
    s = Search(using=es, index=config['index']['resolver'])
    if self.uuid:
        s = s.query('match', _dataset=self.uuid)
    if self.core:
        s = s.query('match', _core=self.core)
    for k, v in query.iteritems():
        options = config['search'].get(k, {})
        if not v or str(v) == "":
            continue
        if ".kw" in k or options.get('type') == "keyword":
            s = s.filter('term', **{k.replace(".term", ""): v})
        elif ".prefix" in k or options.get('type') == "prefix":
            s = s.query('prefix', **{k.replace(".prefix", ""): v.lower()})
        elif ".fuzzy" in k or options.get('type') == "fuzzy":
            s = s.query('fuzzy', **{k.replace(".fuzzy", ""): v.lower()})
        elif ".term" in k or options.get("type") == "term":
            s = s.query('term', **{k.replace(".term", ""): v.lower()})
        else:
            s = s.query('match', **{k: v})
    a = A('geo_bounds', field='_location')
    s.aggs.bucket('viewport', a)
    s = s.sort('_id')
    s = s[skip:skip + 50]
    result = s.execute()
    query['skip'] = skip
    return result
def events():
    """Returns list of Event objects.

    @returns: list
    """
    s = Search(doc_type='events')[0:MAX_SIZE]
    s = s.sort('start_date')
    s = s.fields([
        'id',
        'published',
        'title',
        'description',
        'start_date',
        'end_date',
        'article_title',
        'resource_uri',
    ])
    response = s.execute()
    data = [
        Event(
            id=hit.meta.id,
            published=hitvalue(hit, 'published'),
            title=hitvalue(hit, 'title'),
            description=hitvalue(hit, 'description'),
            start_date=hitvalue(hit, 'start_date'),
            end_date=hitvalue(hit, 'end_date'),
            article_title=hitvalue(hit, 'article_title'),
            resource_uri=hitvalue(hit, 'resource_uri'),
        )
        for hit in response
    ]
    return data
def get_elk_response(self, request, task_id):
    # FIXME try to use django-rest-elasticsearch instead
    page = int(request.GET.get('page')) if request.GET.get('page') else 0
    limit = int(request.GET.get('limit')) if request.GET.get('limit') else 20
    if request.GET.get('order'):
        field, order = request.GET.get('order').split(',')
        sort_option = {field: {"order": order}}
    else:
        sort_option = {}
    client = Elasticsearch(hosts=[ELK_HOST + ':9200'],
                           http_auth=('elastic', 'L5M3LPXk6QhxTyZenwo5'))
    s = Search(using=client, index="logstash*").query("match", task_id=task_id)
    if request.GET.get('category'):
        s = s.query(Match(categories={"query": request.GET.get('category')}))
    s = s.sort(sort_option)[(page * limit):(page * limit + limit)]
    try:
        elk_response = s.execute()
    except RequestError as exc:
        logging.warning(exc)
        return Response({"Message": "Wrong query!"})
    return elk_response, limit, page
def get(self, request, *args, **kwargs):
    q = None
    page = 0
    page_size = 10
    order_fields = []
    for key, value in request.GET.items():
        if key in ('tag', 'content', 'autocomplete'):
            q = Q('match', tag=value)
        if key == 'page':
            page = int(value) - 1 if int(value) > 0 else 0
        if key == 'page_size':
            page_size = int(value)
        if key in ('order', 'orderby'):
            order_fields = value.split(',')
    if page_size > 1000:
        page_size = 1000
    s = Search(index='galaxy_tags')
    s = s.query(q) if q else s
    s = s.sort(*order_fields) if len(order_fields) > 0 else s
    s = s[page * page_size:page * page_size + page_size]
    result = s.execute()
    serializer = ElasticSearchDSLSerializer(result.hits, many=True)
    response = get_response(request=request, result=result, view='api:tags_search_view')
    response['results'] = serializer.data
    return Response(response)
def search(self, index: str, query_string: QueryString, search_from: int, size: int,
           sort_by: dict = None, fields: list = None):
    results = []
    more = False
    search = Search(
        using=self.client, index=index).query(query_string).source(fields)[search_from:size]
    search = search.sort(sort_by)
    search_results = search.execute().to_dict()
    total = int(search_results['hits']['total']['value'])
    for result in search_results['hits']['hits']:
        response = {}
        response.update(result['_source'])
        results.append(response)
    if -1 < size < total:
        more = True
    return {
        "more": more,
        "objects": results,
    }
def generateRawQuery(user, starttime, endtime):
    """
    Generate the raw query to get all usage for a user between starttime and endtime.
    """
    es = Elasticsearch(
        [GRACC],
        timeout=300,
        use_ssl=True,
        verify_certs=True
    )
    endtime = endtime + datetime.timedelta(days=1)
    MAXSZ = 2 ** 30
    index = "gracc.osg.raw*"
    s = Search(using=es, index=index)
    # Starttime and endtime are both datetime objects
    print("Querying for user {} between {} and {}".format(user, starttime, endtime))
    s = s.query(
        "bool",
        filter=[
            Q("range", EndTime={"gte": starttime, "lt": endtime})
            & Q("term", ResourceType="Payload")
            & Q("term", DN=user)
        ],
    )
    s = s.sort("StartTime")
    #print(s.to_dict())
    return s
def locations():
    """Returns list of Location objects.

    @returns: list
    """
    s = Search(doc_type='location')[0:MAX_SIZE]
    s = s.sort('id')
    s = s.fields([
        'id',
        'category',
        'title',
        'location_name',
        'description',
        'lat',
        'lng',
        'resource_uri',
        'location_uri',
        'location_url',
    ])
    response = s.execute()
    return [
        Location(
            id=hitvalue(hit, 'id'),
            category=hitvalue(hit, 'category'),
            title=hitvalue(hit, 'title'),
            location_name=hitvalue(hit, 'location_name'),
            description=hitvalue(hit, 'description'),
            lat=hitvalue(hit, 'lat'),
            lng=hitvalue(hit, 'lng'),
            resource_uri=hitvalue(hit, 'resource_uri'),
            location_uri=hitvalue(hit, 'location_uri'),
            location_url=hitvalue(hit, 'location_url'),
        )
        for hit in response
    ]
def test_paginator(self):
    search = Search(index=Token.es_doc_type._doc_type.index,
                    doc_type=Token.es_doc_type._doc_type.name)
    search = search.sort('name')
    page_size = 2
    paginator = ESSearchPaginator(search, page_size)

    page = paginator.page(1)
    self.assertTrue(page.has_other_pages)
    self.assertEqual(len(page.hits), page_size)
    self.assertEqual(page.total_count, 3)
    self.assertEqual(page.hits[0]['name'], 'token 0')
    self.assertEqual(page.hits[1]['name'], 'token 1')
    self.assertEqual(page.paginator, paginator)
    self.assertEqual(page.number, 1)
    self.assertIsNotNone(page.response)

    page = paginator.page(2)
    self.assertFalse(page.has_other_pages)
    self.assertEqual(len(page.hits), 1)
    self.assertEqual(page.hits[0]['name'], 'token 2')
def pages():
    """Returns list of published light Page objects.

    @returns: list
    """
    KEY = 'encyc-front:pages'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='articles').filter('term', published_encyc=True)[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title',
            'title',
            'title_sort',
            'published',
            'modified',
            'categories',
        ])
        response = s.execute()
        data = [
            Page(
                url_title=hitvalue(hit, 'url_title'),
                title=hitvalue(hit, 'title'),
                title_sort=hitvalue(hit, 'title_sort'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
                categories=hit.get('categories', []),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data
def authors(num_columns=None):
    """Returns list of published light Author objects.

    @returns: list
    """
    KEY = 'encyc-front:authors'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='authors')[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title',
            'title',
            'title_sort',
            'published',
            'modified',
        ])
        response = s.execute()
        data = [
            Author(
                url_title=hitvalue(hit, 'url_title'),
                title=hitvalue(hit, 'title'),
                title_sort=hitvalue(hit, 'title_sort'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    if num_columns:
        return _columnizer(data, num_columns)
    return data
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}
    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool', should=[F('term', message=start_message),
                                 F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc, we want the latest events
    response = s.execute()
    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
def load_filtered_top_ko_associations_search_after(filters, search_after='', size=50):
    """Retrieves top associations and filter them through the tickable options"""
    s = Search(using=es, doc_type='ko_associations')
    s = s.sort('-score', '_uid')
    # By default, leave out associations with no gene
    s = s.filter(Q({'nested': {'path': 'gene', 'query': {'exists': {'field': 'gene.chr'}}}}))
    # Only need to filter by chromosome, maf or mac
    if 'chr' in filters and len(filters['chr']) > 0 and len(filters['chr']) < 5:
        s = s.filter(Q('bool', should=[
            Q({'nested': {'path': 'gene', 'query': {'match': {
                'gene.chr': chrom if len(chrom) > 3 else 'chr%s' % chrom}}}})
            for chrom in filters['chr']]))
    if 'significant' in filters:
        s = s.filter(Q('range', mac={'gte': 6}))
        s = s.filter('term', overBonferroni='T')  # TODO: change this to permutation once the new indexed scores are in.
    if search_after != '':
        search_after = parse_lastel(search_after)
        print(search_after)
        s = s.extra(search_after=search_after)
    s = s[0:size]
    result = s.execute()
    associations = result['hits']['hits']
    last_el = result['hits']['hits'][-1]['sort']
    # Transformation needed to safeguard url transmission
    last_el[1] = "-".join(last_el[1].split('#'))
    return ([association['_source'].to_dict() for association in associations],
            result['hits']['total'], last_el)
def get_outbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    wikipedia_article: Optional[str] = None,
    limit: int = 100,
    offset: Optional[int] = None,
    es_index: str = "fatcat_ref",
) -> RefHits:
    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", source_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", source_work_ident=work_ident)
    elif wikipedia_article:
        search = search.filter("term", source_wikipedia_article=wikipedia_article)
    else:
        raise ValueError("require a lookup key")

    search = search.sort("ref_index")  # re-sort by index
    hits = _execute_ref_query(search, limit=limit, offset=offset)
    hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
    return hits
def get200query(indexbase, my200page, agency, domaintype, org, mimetype, query):
    index = indexbase + '-200scanner'
    s = Search(using=es, index=index)
    s = s.sort('domain')

    if query is None:
        # produce an empty query
        s = s.query(~Q('match_all'))
    else:
        if my200page == 'All Scans':
            s = s.query('simple_query_string', query=query)
        else:
            field = 'data.' + deperiodize(my200page)
            s = s.query('query_string', query=query, fields=[field])

    if agency != 'All Agencies' and agency is not None:
        agencyquery = '"' + agency + '"'
        s = s.query("query_string", query=agencyquery, fields=['agency'])

    if domaintype != 'All Branches' and domaintype is not None:
        domaintypequery = '"' + domaintype + '"'
        s = s.query("query_string", query=domaintypequery, fields=['domaintype'])

    if org != 'All Organizations' and org is not None:
        orgquery = '"' + org + '"'
        s = s.query("query_string", query=orgquery, fields=['organization'])

    # filter with data derived from the pagedata index (if needed)
    pagedatadomains = []
    if mimetype != 'all content_types':
        domains = domainsWith(my200page, 'content_type', mimetype, indexbase + '-pagedata')
        pagedatadomains.extend(domains)
    if len(pagedatadomains) > 0:
        s = s.filter("terms", domain=pagedatadomains)

    return s
def get_trade_history(size=10, from_date='2015-10-10', to_date='now',
                      sort_by='-operation_id_num', search_after=None,
                      base="1.3.0", quote="1.3.121"):
    s = Search(using=es, index="bitshares-*")
    s = s.extra(size=size)
    if search_after and search_after != '':
        s = s.extra(search_after=search_after.split(','))

    q = Q()
    q = q & Q("match", operation_type=4)
    q = q & Q("match", operation_history__op_object__is_maker=True)
    q = q & Q("match", operation_history__op_object__fill_price__base__asset_id=base)
    q = q & Q("match", operation_history__op_object__fill_price__quote__asset_id=quote)

    range_query = Q("range", block_data__block_time={'gte': from_date, 'lte': to_date})

    s.query = q & range_query
    s = s.sort(*sort_by.split(','))
    response = s.execute()
    verify_es_response(response)

    return [hit.to_dict() for hit in response]
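# Hypothetical call to get_trade_history() above; dates are example values and the asset
# ids simply repeat the defaults in the signature.
trades = get_trade_history(
    size=5,
    from_date='2019-01-01',
    to_date='now',
    sort_by='-operation_id_num',
    base='1.3.0',     # default base asset id from the signature
    quote='1.3.121',  # default quote asset id from the signature
)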
def build_product_es(self, args, product_search_condition, sources):
    product_es = Search() \
        .query(product_search_condition) \
        .source(['sku'] if args.get('only_sku') else sources) \
        .extra(track_total_hits=True)
    product_es = product_es.sort(*self.sort_condition(args))
    return product_es
def search_talks(page=None, sort=None):
    """ Get Talks from ElasticSearch
    """
    client = Elasticsearch([{
        'host': settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port': settings.ELASTICSEARCH['default']['PORT'],
    }])
    s = Search(using=client, index="vtalks")
    # Pagination
    if page:
        start = 0
        end = 10
        if page > 1:
            start = settings.PAGE_SIZE * (page - 1)
            end = settings.PAGE_SIZE * page
        s = s[start:end]
    # Sorting
    s = s.sort({sort: {"order": "desc"}})
    # Fields selection
    s = s.source(['id'])
    response = s.execute()
    results_total = response.hits.total
    results_ids = [hit.id for hit in response.hits]
    return results_total, results_ids
def sources():
    """Returns list of published light Source objects.

    @returns: list
    """
    KEY = 'encyc-front:sources'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='sources')[0:MAX_SIZE]
        s = s.sort('encyclopedia_id')
        s = s.fields([
            'encyclopedia_id',
            'published',
            'modified',
            'headword',
            'media_format',
            'img_path',
        ])
        response = s.execute()
        data = [
            Source(
                encyclopedia_id=hitvalue(hit, 'encyclopedia_id'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
                headword=hitvalue(hit, 'headword'),
                media_format=hitvalue(hit, 'media_format'),
                img_path=hitvalue(hit, 'img_path'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data
def get_all(sort_by=None, start=0, limit=10, cve_id=None) -> List[Plugin]:
    search = Search(using=current_app.elasticsearch, index=PLUGINS_INDEX)[start:limit]
    if cve_id:
        search = search.query("term", cvelist__keyword=cve_id)
    if sort_by:
        if sort_by == "score":
            search = search.sort({f"cvss.{sort_by}": {"order": "desc"}})
        else:
            search = search.sort({f"{sort_by}": {"order": "desc"}})
    response = search.execute()
    response = [
        PluginsService._parse_single_result(result) for result in response.hits
    ]
    return response
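# Hypothetical calls to get_all() above; the CVE identifier is an example value only.
top_plugins = get_all(sort_by="score", start=0, limit=20)
log4shell_plugins = get_all(cve_id="CVE-2021-44228")  # example CVE id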
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}
    start_message = 'scenario.p2p_connect.starting.clients'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool', should=[
        F('term', at_message=start_message),
        F('term', at_message=stop_message)
    ])
    s = s.fields(['@message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc, we want the latest events
    response = s.execute()
    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['@message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
def fetch(session):
    s = Search(client)
    s = s.filter('bool', should=[F('term', message='p2p.disconnected'),
                                 F('term', message='p2p.connected')])
    s = s.filter('range', **{'@timestamp': dict(gte=session['start'], lte=session['stop'])})
    s = s.fields(['json_message.p2p.connected.remote_id', 'guid', 'message', '@timestamp'])
    s = s[0:100000]
    # s = s[0:10]
    s = s.sort('@timestamp')
    response = s.execute()
    return response
def search(self, **params):
    limit_cat = params.get('cat', "").strip()
    limit_forum = params.get('forum', "").strip()
    limit_count = int(params.get('count', 100))
    limit_size_min = human2bytes(params.get('min', "0b"))
    limit_size_max = human2bytes(params.get('max', "0b"))
    limit_wild = int(params.get('wild', 0))

    arg = params.get('query', '').strip()
    if not arg:
        arg = "hobbit"

    s = Search(using=es, index=ela_index)
    if limit_size_min:
        s = s.filter("range", size={'gte': limit_size_min})
    if limit_size_max:
        s = s.filter("range", size={'lte': limit_size_max})

    arg = arg.split(' ')
    if limit_wild:
        q = Q("wildcard", name="*" + arg.pop(0) + "*")
        for a in arg:
            q = q & Q("wildcard", name="*" + a + "*")
    else:
        q = Q("match", name=arg.pop(0))
        for a in arg:
            q = q & Q("match", name=a)

    if len(limit_cat):
        for a in limit_cat.split(' '):
            q = q & Q("match", category=a)
    if len(limit_forum):
        for a in limit_forum.split(' '):
            q = q & Q("match", forum=a)

    s = s.query(q)
    #cherrypy.log("query is " + str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    #cherrypy.log("query have " + str(size) + " elements")
    if size > limit_count:
        size = limit_count

    s = s.sort('-size')
    s = s.extra(size=size)
    r = s.execute()

    data = []
    for b in r:
        a = [b.id, b.size, b.name, b.category, b.forum, b.date[0] if b.date else '', b.hash]
        data.append(a)
    return {'data': data}
def search(self, **params):
    limit_author = params.get('author', "").strip()
    limit_title = params.get('title', "").strip()
    limit_count = int(params.get('count', 10))
    limit_wild = int(params.get('wild', 0))

    q = None
    if not limit_author and not limit_title:
        limit_title = "hobbit"

    s = Search(using=es, index=ela_index)

    arg = limit_title.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        if limit_wild:
            q = Q("wildcard", title="*" + arg.pop(0) + "*")
            for a in arg:
                q = q & Q("wildcard", title="*" + a + "*")
        else:
            q = Q("match", title=arg.pop(0))
            for a in arg:
                q = q & Q("match", title=a)

    arg = limit_author.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        for a in arg:
            if q:
                q = q & Q("match", author=a)
            else:
                q = Q("match", author=a)

    s = s.query(q)
    #cherrypy.log("query is " + str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    if size > limit_count:
        size = limit_count

    s = s.sort('-date')
    s = s.extra(size=size)
    r = s.execute()
    #cherrypy.log("result is " + str(r))

    data = []
    for b in r:
        a = [b.id, b.author, b.title, b.size, b.date]
        data.append(a)
    return {'data': data}
def mapcategories(num_columns=None):
    """Returns list of MapCategory objects.

    @returns: list
    """
    s = Search(doc_type='mapcategory')[0:MAX_SIZE]
    s = s.sort('id')
    s = s.fields([
        'id',
        'title',
    ])
    response = s.execute()
    return [
        MapCategory(
            id=hitvalue(hit, 'id'),
            title=hitvalue(hit, 'title'),
        )
        for hit in response
    ]
def convert_filters_to_query(self, filters):
    s = Search(using=self.client)
    spec = filters.get("dataset__spec", None)
    modified_from = filters.get('modified__gt', None)
    modified_until = filters.get('modified__lt', None)
    if spec and not self.spec:
        self.spec = spec
    if self.spec:
        s = s.query("match", **{'system.spec.raw': self.spec})
    if self.query:
        if 'query' in self.query:
            s = s.query(self.query.get('query'))
        if 'filter' in self.query:
            s = s.query(self.query.get('filter'))
    if modified_from:
        s = s.filter("range", **{"system.modified_at": {"gte": modified_from}})
    if modified_until:
        s = s.filter("range", **{"system.modified_at": {"lte": modified_until}})
    s = s.sort({"system.modified_at": {"order": "asc"}})
    return s[self.cursor: self.get_next_cursor()]
def search(
        hosts, index, query_type='multi_match', query='', filters={},
        sort='m_pseudoid', start=0, pagesize=10
):
    """Constructs Search object

    Note: allows any combination of filters, even illogical ones

    @param hosts: list settings.DOCSTORE_HOSTS
    @param index: elasticsearch_dsl.Index
    @param query_type: str Name of query type.
    @param query: str Query string.
    @param filters: dict Filters and their arguments.
    @param sort: str Name of field on which to sort.
    @param start: int Start of result set.
    @param pagesize: int Number of records to return.
    @returns: elasticsearch_dsl.Search
    """
    ## remove empty filter args
    #filter_args = {key:val for key,val in filters.items() if val}
    #if not (query or filter_args):
    #    return None,[]
    s = Search(using=ES, index=index)
    s = s.doc_type(Record)
    if filters:
        for field, values in filters.items():
            if values:
                # multiple terms for a field are OR-ed
                s = s.filter('terms', **{field: values})
    if query:
        s = s.query(query_type, query=query, fields=definitions.FIELDS_MASTER)
    # aggregations
    if filters:
        for field in filters.keys():
            s.aggs.bucket(field, 'terms', field=field, size=1000)
    s = s.fields(definitions.FIELDS_MASTER)
    s = s.sort(sort)
    s = s[start:start + pagesize]
    return s
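# Hypothetical call to the search() helper above; the host list, index name and filter
# field/values are illustrative assumptions, not values taken from the original code.
s = search(
    hosts=['localhost:9200'],            # assumed host list
    index='namesdb-records',             # assumed index name
    query_type='multi_match',
    query='Manzanar',
    filters={'m_camp': ['9-manzanar']},  # assumed filter field and value
    sort='m_pseudoid',
    start=0,
    pagesize=25,
)
results = s.execute()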
class TopologyData(object):
    """A base class used by models that are really Elasticsearch entries,
    and not db tables."""

    _DOC_TYPE = ""
    _INDEX_PREFIX = ""

    def __init__(self):
        self.conn = es_conn()
        self.search = Search(self.conn)

        # Using the private setters over methods simplifies mocking for
        # unit tests.
        # pylint: disable=W0212
        self.search._doc_type = self._DOC_TYPE
        self.search._index = es_indices(self._INDEX_PREFIX, self.conn)

    @classmethod
    def _sort_arg(cls, key, order):
        """Return key as, key or -key, depending on the sort order."""
        if order in ["+", "asc"]:
            return key          # translates to [{key: {'order': 'asc'}}]
        elif order in ["-", "desc"]:
            return "-" + key    # translates to [{key: {'order': 'desc'}}]
        else:
            raise ValueError("Valid order values are in [+, -, asc, desc]")

    def get(self, count=1, sort_key="@timestamp", sort_order="desc"):
        """Return the latest n instances from ES or None if not found."""
        from elasticsearch import ElasticsearchException

        try:
            # sort() returns a new Search object, so keep the sorted copy
            self.search = self.search.sort(self._sort_arg(sort_key, sort_order))
            self.search = self.search[0:count]
            logger.debug("[get] search = %s", self.search.to_dict())
            # pylint: disable=W0212
            logger.debug("[get] index = %s", self.search._index)
            logger.debug("[get] doc_type = %s", self._DOC_TYPE)

            return self.search.execute()
        except ElasticsearchException as exc:
            logger.debug("get from ES failed, exception was %s", exc.message)
            raise
        except ValueError as exc:
            logger.exception(exc)
            raise

    def post(self, body, **_):
        """Post a record to the database.

        :arg body: record body as JSON object
        :arg _: Unused.
        :return: id of the inserted record
        """
        logger.debug("post called with body = %s", json.dumps(body))

        response = self.conn.create(
            daily_index(self._INDEX_PREFIX), self._DOC_TYPE, body, refresh=True)

        logger.debug('[post] response = %s', json.dumps(response))
        return response['_id']
class EsSearch(object): def __init__(self, families, previous_search_results=None): self._client = get_es_client() self.samples_by_family_index = defaultdict(lambda: defaultdict(dict)) for s in get_latest_loaded_samples(families): self.samples_by_family_index[s.elasticsearch_index][s.individual.family.guid][s.sample_id] = s if len(self.samples_by_family_index) < 1: raise InvalidIndexException('No es index found') self._set_index_metadata() if len(self.samples_by_family_index) != len(self.index_metadata): raise InvalidIndexException('Could not find expected indices: {}'.format( ', '.join(set(self.samples_by_family_index.keys()) - set(self.index_metadata.keys())) )) self.previous_search_results = previous_search_results or {} self._search = Search() self._index_searches = defaultdict(list) self._sort = None self._allowed_consequences = None def _set_index_metadata(self): self.index_metadata = get_index_metadata(','.join(self.samples_by_family_index.keys()), self._client) def filter(self, new_filter): self._search = self._search.filter(new_filter) return self def sort(self, sort): self._sort = _get_sort(sort) self._search = self._search.sort(*self._sort) def filter_by_annotations(self, annotations, pathogenicity_filter): consequences_filter, allowed_consequences = _annotations_filter(annotations) if allowed_consequences: if pathogenicity_filter: consequences_filter |= pathogenicity_filter self.filter(consequences_filter) self._allowed_consequences = allowed_consequences def filter_by_genotype(self, inheritance, quality_filter=None): has_previous_compound_hets = self.previous_search_results.get('grouped_results') inheritance_mode = (inheritance or {}).get('mode') inheritance_filter = (inheritance or {}).get('filter') or {} if inheritance_filter.get('genotype'): inheritance_mode = None quality_filter = dict({'min_ab': 0, 'min_gq': 0}, **(quality_filter or {})) if quality_filter['min_ab'] % 5 != 0: raise Exception('Invalid ab filter {}'.format(quality_filter['min_ab'])) if quality_filter['min_gq'] % 5 != 0: raise Exception('Invalid gq filter {}'.format(quality_filter['min_gq'])) if quality_filter and quality_filter.get('vcf_filter') is not None: self.filter(~Q('exists', field='filters')) for index, family_samples_by_id in self.samples_by_family_index.items(): if not inheritance and not quality_filter['min_ab'] and not quality_filter['min_gq']: search_sample_count = sum(len(samples) for samples in family_samples_by_id.values()) index_sample_count = Sample.objects.filter(elasticsearch_index=index).count() if search_sample_count == index_sample_count: # If searching across all families in an index with no inheritance mode we do not need to explicitly # filter on inheritance, as all variants have some inheritance for at least one family continue genotypes_q = _genotype_inheritance_filter( inheritance_mode, inheritance_filter, family_samples_by_id, quality_filter, ) compound_het_q = None if inheritance_mode == COMPOUND_HET: compound_het_q = genotypes_q else: self._index_searches[index].append(self._search.filter(genotypes_q)) if inheritance_mode == RECESSIVE: compound_het_q = _genotype_inheritance_filter( COMPOUND_HET, inheritance_filter, family_samples_by_id, quality_filter, ) if compound_het_q and not has_previous_compound_hets: compound_het_search = self._search.filter(compound_het_q) compound_het_search.aggs.bucket( 'genes', 'terms', field='geneIds', min_doc_count=2, size=MAX_COMPOUND_HET_GENES+1 ).metric( 'vars_by_gene', 'top_hits', size=100, sort=self._sort, _source=QUERY_FIELD_NAMES ) 
self._index_searches[index].append(compound_het_search) def search(self, page=1, num_results=100): indices = self.samples_by_family_index.keys() logger.info('Searching in elasticsearch indices: {}'.format(', '.join(indices))) num_loaded = len(self.previous_search_results.get('all_results', [])) if len(indices) == 1 \ and len(self._index_searches.get(indices[0], [])) <= 1 \ and not self.previous_search_results.get('grouped_results'): start_index = None if (page - 1) * num_results < num_loaded: start_index = num_loaded return self._execute_single_search(page, num_results, start_index=start_index) elif not self._index_searches: # If doing all project-families all inheritance search, do it as a single query # Load all variants, do not skip pages num_loaded += self.previous_search_results.get('duplicate_doc_count', 0) if num_loaded >= (page-1)*num_results: start_index = num_loaded else: start_index = 0 return self._execute_single_search( page, num_results, start_index=start_index, deduplicate=True ) else: return self._execute_multi_search(page, num_results) def _execute_single_search(self, page, num_results, deduplicate=False, start_index=None): index_name = ','.join(self.samples_by_family_index.keys()) search = self._get_paginated_searches( index_name, page, num_results*len(self.samples_by_family_index), start_index=start_index )[0] response = self._execute_search(search) variant_results, total_results, is_compound_het = self._parse_response(response) self.previous_search_results['total_results'] = total_results results_start_index = (page - 1) * num_results if is_compound_het: variant_results = _sort_compound_hets(variant_results) self.previous_search_results['grouped_results'] = variant_results end_index = min(results_start_index + num_results, total_results) return _get_compound_het_page(variant_results, results_start_index, end_index) if deduplicate: variant_results = self._deduplicate_results(variant_results) # Only save contiguous pages of results: previous_all_results = self.previous_search_results.get('all_results', []) if len(previous_all_results) >= results_start_index: self.previous_search_results['all_results'] = self.previous_search_results.get('all_results', []) + variant_results variant_results = self.previous_search_results['all_results'][results_start_index:] return variant_results[:num_results] def _execute_multi_search(self, page, num_results): indices = self.samples_by_family_index.keys() if not self.previous_search_results.get('loaded_variant_counts'): self.previous_search_results['loaded_variant_counts'] = {} ms = MultiSearch() for index_name in indices: start_index = 0 if self.previous_search_results['loaded_variant_counts'].get(index_name): index_total = self.previous_search_results['loaded_variant_counts'][index_name]['total'] start_index = self.previous_search_results['loaded_variant_counts'][index_name]['loaded'] if start_index >= index_total: continue else: self.previous_search_results['loaded_variant_counts'][index_name] = {'loaded': 0, 'total': 0} searches = self._get_paginated_searches(index_name, page, num_results, start_index=start_index) ms = ms.index(index_name) for search in searches: ms = ms.add(search) responses = self._execute_search(ms) new_results = [] compound_het_results = self.previous_search_results.get('compound_het_results', []) for response in responses: response_hits, response_total, is_compound_het = self._parse_response(response) if not response_total: continue index_name = response.hits[0].meta.index if is_compound_het: compound_het_results 
+= response_hits self.previous_search_results['loaded_variant_counts']['{}_compound_het'.format(index_name)] = {'total': response_total} else: new_results += response_hits self.previous_search_results['loaded_variant_counts'][index_name]['total'] = response_total self.previous_search_results['loaded_variant_counts'][index_name]['loaded'] += len(response_hits) self.previous_search_results['total_results'] = sum(counts['total'] for counts in self.previous_search_results['loaded_variant_counts'].values()) # combine new results with unsorted previously loaded results to correctly sort/paginate all_loaded_results = self.previous_search_results.get('all_results', []) previous_page_record_count = (page - 1) * num_results if len(all_loaded_results) >= previous_page_record_count: loaded_results = all_loaded_results[:previous_page_record_count] new_results += all_loaded_results[previous_page_record_count:] else: loaded_results = [] new_results += self.previous_search_results.get('variant_results', []) new_results = sorted(new_results, key=lambda variant: variant['_sort']) variant_results = self._deduplicate_results(new_results) if compound_het_results or self.previous_search_results.get('grouped_results'): if compound_het_results: compound_het_results = self._deduplicate_compound_het_results(compound_het_results) return self._process_compound_hets(compound_het_results, variant_results, num_results) else: self.previous_search_results['all_results'] = loaded_results + variant_results return variant_results[:num_results] def _get_paginated_searches(self, index_name, page, num_results, start_index=None): searches = [] for search in self._index_searches.get(index_name, [self._search]): search = search.index(index_name) if search.aggs.to_dict(): # For compound het search get results from aggregation instead of top level hits search = search[:1] logger.info('Loading compound hets for {}'.format(index_name)) else: end_index = page * num_results if start_index is None: start_index = end_index - num_results search = search[start_index:end_index] search = search.source(QUERY_FIELD_NAMES) logger.info('Loading {} records {}-{}'.format(index_name, start_index, end_index)) searches.append(search) return searches def _execute_search(self, search): logger.debug(json.dumps(search.to_dict(), indent=2)) try: return search.using(self._client).execute() except elasticsearch.exceptions.ConnectionTimeout as e: canceled = self._delete_long_running_tasks() logger.error('ES Query Timeout. Canceled {} long running searches'.format(canceled)) raise e def _parse_response(self, response): if hasattr(response.aggregations, 'genes') and response.hits: response_hits, response_total = self._parse_compound_het_response(response) return response_hits, response_total, True response_total = response.hits.total logger.info('Total hits: {} ({} seconds)'.format(response_total, response.took / 1000.0)) return [self._parse_hit(hit) for hit in response], response_total, False def _parse_compound_het_response(self, response): if len(response.aggregations.genes.buckets) > MAX_COMPOUND_HET_GENES: raise Exception('This search returned too many compound heterozygous variants. 
Please add stricter filters') index_name = response.hits[0].meta.index family_unaffected_individual_guids = { family_guid: {sample.individual.guid for sample in samples_by_id.values() if sample.individual.affected == UNAFFECTED} for family_guid, samples_by_id in self.samples_by_family_index[index_name].items() } variants_by_gene = {} for gene_agg in response.aggregations.genes.buckets: gene_variants = [self._parse_hit(hit) for hit in gene_agg['vars_by_gene']] gene_id = gene_agg['key'] if gene_id in variants_by_gene: continue if self._allowed_consequences: # Variants are returned if any transcripts have the filtered consequence, but to be compound het # the filtered consequence needs to be present in at least one transcript in the gene of interest gene_variants = [variant for variant in gene_variants if any( transcript['majorConsequence'] in self._allowed_consequences for transcript in variant['transcripts'][gene_id] )] if len(gene_variants) < 2: continue # Do not include groups multiple times if identical variants are in the same multiple genes if any(variant['mainTranscript']['geneId'] != gene_id for variant in gene_variants): primary_genes = [variant['mainTranscript']['geneId'] for variant in gene_variants] if all(gene == primary_genes[0] for gene in primary_genes): is_valid_gene = True if self._allowed_consequences: is_valid_gene = all(any( transcript['majorConsequence'] in self._allowed_consequences for transcript in variant['transcripts'][primary_genes[0]] ) for variant in gene_variants) if is_valid_gene: gene_id = primary_genes[0] if gene_id in variants_by_gene: continue else: variant_ids = [variant['variantId'] for variant in gene_variants] for gene in set(primary_genes): if variant_ids == [variant['variantId'] for variant in variants_by_gene.get(gene, [])]: continue family_guids = set(gene_variants[0]['familyGuids']) for variant in gene_variants[1:]: family_guids = family_guids.intersection(set(variant['familyGuids'])) invalid_family_guids = set() for family_guid in family_guids: for individual_guid in family_unaffected_individual_guids[family_guid]: # To be compound het all unaffected individuals need to be hom ref for at least one of the variants is_family_compound_het = any( variant['genotypes'].get(individual_guid, {}).get('numAlt') != 1 for variant in gene_variants) if not is_family_compound_het: invalid_family_guids.add(family_guid) break family_guids -= invalid_family_guids if not family_guids: continue for variant in gene_variants: variant['familyGuids'] = list(family_guids) variants_by_gene[gene_id] = gene_variants total_compound_het_results = sum(len(variants) for variants in variants_by_gene.values()) logger.info('Total compound het hits: {}'.format(total_compound_het_results)) return [{k: v} for k, v in variants_by_gene.items()], total_compound_het_results def _parse_hit(self, raw_hit): hit = {k: raw_hit[k] for k in QUERY_FIELD_NAMES if k in raw_hit} index_name = raw_hit.meta.index index_family_samples = self.samples_by_family_index[index_name] if hasattr(raw_hit.meta, 'matched_queries'): family_guids = list(raw_hit.meta.matched_queries) else: # Searches for all inheritance and all families do not filter on inheritance so there are no matched_queries alt_allele_samples = set() for alt_samples_field in HAS_ALT_FIELD_KEYS: alt_allele_samples.update(hit[alt_samples_field]) family_guids = [family_guid for family_guid, samples_by_id in index_family_samples.items() if any(sample_id in alt_allele_samples for sample_id in samples_by_id.keys())] genotypes = {} for family_guid in 
family_guids: samples_by_id = index_family_samples[family_guid] genotypes.update({ samples_by_id[genotype_hit['sample_id']].individual.guid: _get_field_values(genotype_hit, GENOTYPE_FIELDS_CONFIG) for genotype_hit in hit[GENOTYPES_FIELD_KEY] if genotype_hit['sample_id'] in samples_by_id }) genome_version = self.index_metadata[index_name].get('genomeVersion') lifted_over_genome_version = None lifted_over_chrom = None lifted_over_pos = None liftover_grch38_to_grch37 = _liftover_grch38_to_grch37() if liftover_grch38_to_grch37 and genome_version == GENOME_VERSION_GRCh38: if liftover_grch38_to_grch37: grch37_coord = liftover_grch38_to_grch37.convert_coordinate( 'chr{}'.format(hit['contig'].lstrip('chr')), int(hit['start']) ) if grch37_coord and grch37_coord[0]: lifted_over_genome_version = GENOME_VERSION_GRCh37 lifted_over_chrom = grch37_coord[0][0].lstrip('chr') lifted_over_pos = grch37_coord[0][1] populations = { population: _get_field_values( hit, POPULATION_RESPONSE_FIELD_CONFIGS, format_response_key=lambda key: key.lower(), lookup_field_prefix=population, existing_fields=self.index_metadata[index_name]['fields'], get_addl_fields=lambda field, field_config: [pop_config.get(field)] + ['{}_{}'.format(population, custom_field) for custom_field in field_config.get('fields', [])], ) for population, pop_config in POPULATIONS.items() } sorted_transcripts = [ {_to_camel_case(k): v for k, v in transcript.to_dict().items()} for transcript in hit[SORTED_TRANSCRIPTS_FIELD_KEY] or [] ] transcripts = defaultdict(list) for transcript in sorted_transcripts: transcripts[transcript['geneId']].append(transcript) result = _get_field_values(hit, CORE_FIELDS_CONFIG, format_response_key=str) result.update({ field_name: _get_field_values(hit, fields, lookup_field_prefix=field_name) for field_name, fields in NESTED_FIELDS.items() }) if hasattr(raw_hit.meta, 'sort'): result['_sort'] = [_parse_es_sort(sort, self._sort[i]) for i, sort in enumerate(raw_hit.meta.sort)] result.update({ 'familyGuids': sorted(family_guids), 'genotypes': genotypes, 'genomeVersion': genome_version, 'liftedOverGenomeVersion': lifted_over_genome_version, 'liftedOverChrom': lifted_over_chrom, 'liftedOverPos': lifted_over_pos, 'mainTranscript': sorted_transcripts[0] if len(sorted_transcripts) else {}, 'populations': populations, 'predictions': _get_field_values( hit, PREDICTION_FIELDS_CONFIG, format_response_key=lambda key: key.split('_')[1].lower() ), 'transcripts': transcripts, }) return result def _deduplicate_results(self, sorted_new_results): duplicates = self.previous_search_results.get('duplicate_doc_count', 0) variant_results = [] for variant in sorted_new_results: if variant_results and variant_results[-1]['variantId'] == variant['variantId']: variant_results[-1]['genotypes'].update(variant['genotypes']) variant_results[-1]['familyGuids'] = sorted(set(variant_results[-1]['familyGuids'] + variant['familyGuids'])) duplicates += 1 else: variant_results.append(variant) self.previous_search_results['duplicate_doc_count'] = duplicates self.previous_search_results['total_results'] -= duplicates return variant_results def _deduplicate_compound_het_results(self, compound_het_results): duplicates = 0 results = {} for variant_group in compound_het_results: gene = variant_group.keys()[0] variants = variant_group[gene] if gene in results: for variant in variants: existing_index = next( (i for i, existing in enumerate(results[gene]) if existing['variantId'] == variant['variantId']), None, ) if existing_index is not None: 
results[gene][existing_index]['genotypes'].update(variant['genotypes']) results[gene][existing_index]['familyGuids'] = sorted( results[gene][existing_index]['familyGuids'] + variant['familyGuids'] ) duplicates += 1 else: results[gene].append(variant) else: results[gene] = variants self.previous_search_results['duplicate_doc_count'] = duplicates + self.previous_search_results.get('duplicate_doc_count', 0) self.previous_search_results['total_results'] -= duplicates return [{k: v} for k, v in results.items()] def _process_compound_hets(self, compound_het_results, variant_results, num_results): if not self.previous_search_results.get('grouped_results'): self.previous_search_results['grouped_results'] = [] # Sort merged result sets grouped_variants = [{None: [var]} for var in variant_results] grouped_variants = compound_het_results + grouped_variants grouped_variants = _sort_compound_hets(grouped_variants) loaded_result_count = sum(len(vars.values()[0]) for vars in grouped_variants + self.previous_search_results['grouped_results']) # Get requested page of variants flattened_variant_results = [] num_compound_hets = 0 num_single_variants = 0 for variants_group in grouped_variants: variants = variants_group.values()[0] flattened_variant_results += variants if loaded_result_count != self.previous_search_results['total_results']: self.previous_search_results['grouped_results'].append(variants_group) if len(variants) > 1: num_compound_hets += 1 else: num_single_variants += 1 if len(flattened_variant_results) >= num_results: break # Only save non-returned results separately if have not loaded all results if loaded_result_count == self.previous_search_results['total_results']: self.previous_search_results['grouped_results'] += grouped_variants self.previous_search_results['compound_het_results'] = [] self.previous_search_results['variant_results'] = [] else: self.previous_search_results['compound_het_results'] = compound_het_results[num_compound_hets:] self.previous_search_results['variant_results'] = variant_results[num_single_variants:] return flattened_variant_results def _delete_long_running_tasks(self): search_tasks = self._client.tasks.list(actions='*search', group_by='parents') canceled = 0 for parent_id, task in search_tasks['tasks'].items(): if task['running_time_in_nanos'] > 10 ** 11: canceled += 1 self._client.tasks.cancel(parent_task_id=parent_id) return canceled
def _build_query(self):
    query = Q()
    source = ['id']
    sort = []
    aggregations = {}
    query_string = None
    as_list = as_dict = False

    for action, value in self.steps:
        if action == 'order_by':
            for key in value:
                if key.startswith('-'):
                    sort.append({key[1:]: 'desc'})
                else:
                    sort.append(key)
        elif action == 'values':
            source.extend(value)
            as_list, as_dict = True, False
        elif action == 'values_dict':
            if value:
                source.extend(value)
            as_list, as_dict = False, True
        elif action == 'query':
            query &= self._process_queries(value)
        elif action == 'filter':
            query &= self._process_filters(value)
        elif action == 'source':
            source.extend(value)
        elif action == 'aggregate':
            aggregations.update(value)
        elif action == 'filter_query_string':
            query_string = value
        else:
            raise NotImplementedError(action)

    # If we have a raw query string we are going to apply all sorts
    # of boosts and filters to improve relevance scoring.
    #
    # We are using the same rules that `search.filters:SearchQueryFilter`
    # implements to have a single source of truth for how our
    # scoring works.
    from olympia.search.filters import SearchQueryFilter

    search = Search().query(query)

    if query_string:
        search = SearchQueryFilter().apply_search_query(
            query_string, search)

    if sort:
        search = search.sort(*sort)

    if source:
        search = search.source(source)

    body = search.to_dict()

    # These are manually added for now to simplify a partial port to
    # elasticsearch-dsl
    if self.start:
        body['from'] = self.start
    if self.stop is not None:
        body['size'] = self.stop - self.start

    if aggregations:
        body['aggs'] = aggregations

    self.source, self.as_list, self.as_dict = source, as_list, as_dict
    return body
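# Hedged example: a minimal sketch of the Search-to-raw-body pattern that
# _build_query relies on. The index-free Search, the 'name'/'created'/'type' fields
# and the aggregation are placeholders; only the elasticsearch-dsl calls
# (query, sort, source, to_dict) mirror the code above.
from elasticsearch_dsl import Q, Search

def build_body(term, start=0, stop=20):
    search = Search().query(Q('match', name=term)).sort('-created').source(['id', 'name'])
    body = search.to_dict()
    # Pagination and aggregations are spliced into the raw dict, matching the
    # "partial port to elasticsearch-dsl" approach in _build_query.
    body['from'] = start
    body['size'] = stop - start
    body['aggs'] = {'by_type': {'terms': {'field': 'type'}}}
    return body

# build_body('firefox') returns a dict ready to pass to Elasticsearch.search(body=...)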
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = None for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ( isinstance(val, basestring) and ' ' not in val ): filter_value = val # If the term contains white spaces, we want to perform # a phrase query. Thus we do nothing here and let this # value be handled later. else: filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator == '>': # greater than filter_type = 'range' filter_value = { 'gt': param.value } elif param.operator == '<': # lower than filter_type = 'range' filter_value = { 'lt': param.value } elif param.operator == '>=': # greater than or equal to filter_type = 'range' filter_value = { 'gte': param.value } elif param.operator == '<=': # lower than or equal to filter_type = 'range' filter_value = { 'lte': param.value } elif param.operator == '__null__': # is null filter_type = 'missing' args['field'] = name if filter_value is not None: args[name] = filter_value if args: if param.operator_not: new_filter = ~F(filter_type, **args) else: new_filter = F(filter_type, **args) if sub_filters is None: sub_filters = new_filter elif param.data_type == 'enum': sub_filters |= new_filter else: sub_filters &= new_filter continue # These use a wildcard and thus need to be in a query # instead of a filter. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } if param.operator in operator_wildcards: if field_data['has_full_version']: name = '%s.full' % name query_type = 'wildcard' args[name] = ( operator_wildcards[param.operator] % param.value ) elif not param.operator: # This is a phrase that was passed down. query_type = 'simple_query_string' args['query'] = param.value[0] args['fields'] = [name] args['default_operator'] = 'and' if args: query = Q(query_type, **args) if param.operator_not: query = ~query search = search.query(query) else: # If we reach this point, that means the operator is # not supported, and we should raise an error about that. 
raise NotImplementedError( 'Operator %s is not supported' % param.operator ) if filters is None: filters = sub_filters elif sub_filters is not None: filters &= sub_filters search = search.filter(filters) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't restrict on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot return it' % value ) if not field_['is_returned']: # Returning this field is not allowed. raise BadArgumentError( value, msg='Field "%s" is not allowed to be returned' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't sort on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot sort on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't facet on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot facet on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if field_['has_full_version']: # If the param has a full version, that means what matters # is the full string, and not its individual terms. field_name += '.full' search.aggs.bucket( value, 'terms', field=field_name, size=self.config.facets_max_number ) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. 
# Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break
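# Hedged example: a self-contained sketch of the "drop missing indices and retry"
# loop used in get(). The error-message regex is an assumption (the service's own
# BAD_INDEX_REGEX is not shown here); the key elasticsearch-dsl detail is that
# Search.index() with no arguments clears the index list before re-applying it.
import re
from elasticsearch.exceptions import NotFoundError

MISSING_INDEX_RE = re.compile(r'no such index \[([^\]]+)\]')  # illustrative pattern

def execute_with_index_fallback(search, indices):
    """Execute the query, discarding any index elasticsearch reports as missing."""
    while indices:
        try:
            return search.execute()
        except NotFoundError as error:
            match = MISSING_INDEX_RE.search(str(error))
            if not match or match.group(1) not in indices:
                raise
            indices.remove(match.group(1))
            search = search.index().index(*indices)
    return None  # no valid index left; mirrors the "empty result" branch above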
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError( '_results_number', msg=( '_results_number cannot be greater ' 'than 1,000' ) ) if results_number < 0: raise BadArgumentError( '_results_number', msg='_results_number cannot be negative' ) elif param.name == '_facets_size': facets_size = param.value[0] # Why cap it? # Because if the query is covering a lot of different # things you can get a really really large query # which can hog resources excessively. # Downloading, as an example, 100k facets (and 0 hits) # when there is plenty of data yields a 11MB JSON # file. if facets_size > 10000: raise BadArgumentError( '_facets_size greater than 10,000' ) for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = self.get_full_field_name(field_data) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '^': '%s*', # starts with '$': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. 
filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator == '__true__': filter_type = 'term' filter_value = True elif param.operator == '@': filter_type = 'regexp' if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = ( operator_wildcards[param.operator] % param.value ) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] # We keep track of the requested columns in order to make sure we # return those column names and not aliases for example. self.request_columns = [] for param in params['_columns']: for value in param.value: if not value: continue self.request_columns.append(value) field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product then descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. if facets_size: self._create_aggregations( params, search, facets_size, histogram_intervals ) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } errors = [] # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = getattr(results, 'aggregations', {}) if aggregations: aggregations = self.format_aggregations(aggregations) shards = getattr(results, '_shards', {}) break # Yay! Results! 
except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise errors.append({ 'type': 'missing_index', 'index': missing_index, }) if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} shards = None break except RequestError as exception: # Try to handle it gracefully if we can find out what # input was bad and caused the exception. try: bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall( exception.error )[-1] # Loop over the original parameters to try to figure # out which *key* had the bad input. for key, value in kwargs.items(): if value == bad_input: raise BadArgumentError(key) except IndexError: # Not an ElasticsearchParseException exception pass raise
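# Hedged example: a sketch of the operator-to-range translation table used in get().
# Newer elasticsearch-dsl releases removed the F() filter helper, so this uses Q()
# to the same effect; 'uptime' and the index name are placeholders.
from elasticsearch_dsl import Q, Search

OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}

def range_filter(field, operator, value):
    """Build e.g. Q('range', uptime={'gte': 3600}) from a comparison operator."""
    return Q('range', **{field: {OPERATOR_RANGE[operator]: value}})

# search = Search(index='crash-reports').filter(range_filter('uptime', '>=', 3600))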
def calc(result): #xx = datetime.datetime.utcnow() #print 'x: ', xx #result['level1']['start'] = datetime.datetime.now().strftime("%B %d %Y, %X") #result['level1']['start'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z") pue = dict() # Constants #pue['N1'] = 1000 / 1000 #pue['N2'] = 1000 / 1000 #pue['N3'] = 710 / 1000 #pue['N4'] = 1700 / 1000 ##pue['N6'] = 0 #pue['N8'] = 500 / 1000 #pue['N9'] = 1600 / 1000 pue['N1'] = 1000 pue['N2'] = 1000 pue['N3'] = 710 pue['N4'] = 1700 ##pue['N6'] = 0 pue['N8'] = 500 pue['N9'] = 1600 result['level1']['start'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") result['level2']['start'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") for x in i: indx = x + '*' ##print 'index', indx for eskey in i[x]: ##timespan = i[x][eskey] ##print 'key', eskey ##print 'value', i[x][eskey] (valueField, scale, variable, source) = i[x][eskey].split('|') if variable not in pue: pue[variable] = 0 #print("clears pue") k = eskey.split('|') s = Search(using=esdb, index=indx) for j in k: (subkey, subvalue) = j.split(':') s = s.query("term", **{subkey: subvalue}) ##print 'subkey', subkey ##print 'subvalue', subvalue ##s = s.query('range', **{'@timestamp':{'gte': '2018-07-01T00:00:00.000Z', 'lt':'2018-08-01T00:00:00.000Z'}}) s = s.query('range', **{'@timestamp':{'gte': 'now-30m', 'lt':'now'}}) s = s.sort('-@timestamp') #s = s.aggs.metric('power_sum', 'sum', field=valueField) s = s[0:1] #print s.to_dict() response = s.execute() #print 'Total %d hits found.' % response.hits.total if response.hits.total != 0: for commit in response: # print commit.to_dict() pue[variable] += commit['data']['datum'] * float(scale) # ##print commit.to_dict() # for n in k: # (sk, sv) = n.split(':') # if sk.find('.') != -1: # (psk, ssk) = sk.split('.') # ##print 'key: ', psk # ##print 'ha', commit[psk][ssk] # ##else: # ##print 'key: ', sk # ##print 'value: ', sv # ##print 'ha', commit[sk] # v = response.aggregations.power_sum # pue[variable] += ( v['value'] / response.hits.total ) # print("Processing %s" % variable) else: ##print s.to_dict() if result['level1'].has_key('missing') is False: result['level1']['missing'] = [variable] result['level2']['missing'] = [variable] else: result['level1']['missing'].append(variable) result['level2']['missing'].append(variable) if result['level2'].has_key('missing-meters') is False: result['level1']['missing-meters'] = [source] result['level2']['missing-meters'] = [source] else: result['level1']['missing-meters'].append(source) result['level2']['missing-meters'].append(source) #print 'No Value for: ', variable, ' ', source pue['N7'] = pue['N7p'] - pue['N7pp'] pue['N10pp'] = pue['N10p'] - pue['N10'] pue['D'] = pue['D1'] + pue['D2'] pue['E'] = pue['E1'] + pue['E2'] pue['F'] = pue['F1'] + pue['F2'] if (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] + pue['D2'] + pue['E1'] + pue['E2'] + pue['F1'] + pue['F2']) == 0: lineLoss = 0 else: lineLoss = (pue['A1'] + pue['A2']) / (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] + pue['D2'] + pue['E1'] + pue['E2'] + pue['F1'] + pue['F2']) if pue['Bp'] == 0: txLoss590 = 0 else: txLoss590 = (pue['B1'] + pue['B2']) / pue['Bp'] if pue['Cp'] == 0: txLoss596 = 0 else: txLoss596 = (pue['C1'] + pue['C2']) / pue['Cp'] #numm1 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] + (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 ) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - 
pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss #demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp'] numm1 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] + pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp'] numm2 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N7'] + pue['N6'] + pue['N8'] + pue['N9'] + pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] + pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18'] ) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss demon2 = pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue['ND2-13'] + pue['ND2-14'] + pue['ND2-15'] + pue['ND2-16'] + pue['ND2-17'] + pue['ND2-18'] + pue['Dp'] + pue['Ep'] + pue['Fp'] if demon1 == 0: p1 = 0 else: p1 = numm1 / demon1 if demon2 == 0: p2 = 0 else: p2 = numm2 / demon2 result['level1']['pue'] = p1 result['level2']['pue'] = p2 result['level1']['end'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") result['level2']['end'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
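# Hedged example: a standalone sketch of the "latest meter reading" query that
# calc() issues for every configured sensor: term filters, a relative time range,
# newest-first sort and a single hit. The index pattern and field names are assumptions.
from elasticsearch_dsl import Search

def latest_reading(esdb, index_pattern, term_filters, window='now-30m'):
    s = Search(using=esdb, index=index_pattern)
    for field, value in term_filters.items():
        s = s.query('term', **{field: value})
    s = s.query('range', **{'@timestamp': {'gte': window, 'lt': 'now'}})
    s = s.sort('-@timestamp')[0:1]  # keep only the most recent document
    response = s.execute()
    if not response.hits:
        return None  # caller records the meter as missing, as calc() does
    return response[0]['data']['datum']

# latest_reading(esdb, 'pue-*', {'sensor': 'N1'})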
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] self._build_fields() # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError('_results_number too large') elif param.name == '_facets_size': facets_size = param.value[0] for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. 
if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = ( operator_wildcards[param.operator] % param.value ) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value, full=False) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: if not value: continue field_name = self.get_field_name(value) search.aggs.bucket( value, 'terms', field=field_name, size=facets_size, ) # Create signature aggregations. if params.get('_aggs.signature'): sig_bucket = A( 'terms', field=self.get_field_name('signature'), size=facets_size, ) for param in params['_aggs.signature']: for value in param.value: if not value: continue if value.startswith('_histogram.'): # This is a histogram aggregation we want to run, # not a terms aggregation. field_name = value[len('_histogram.'):] if field_name not in self.histogram_fields: continue histogram_type = ( self.all_fields[field_name]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) sig_bucket.bucket( 'histogram_%s' % field_name, histogram_type, field=self.get_field_name(field_name), interval=histogram_intervals[field_name], ) else: sig_bucket.bucket( value, 'terms', field=self.get_field_name(value), size=facets_size, ) search.aggs.bucket('signature', sig_bucket) # Create histograms. for f in self.histogram_fields: if params.get('_histogram.%s' % f): histogram_type = ( self.all_fields[f]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) date_bucket = A( histogram_type, field=self.get_field_name(f), interval=histogram_intervals[f], ) for param in params['_histogram.%s' % f]: for value in param.value: if not value: continue field_name = self.get_field_name(value) val_bucket = A( 'terms', field=field_name, size=facets_size, ) date_bucket.bucket(value, val_bucket) search.aggs.bucket('histogram_%s' % f, date_bucket) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. 
However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break
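# Hedged example: a sketch of the nested signature aggregation assembled with A()
# in the facet section above. Field names, sizes and the 'interval' parameter follow
# the older date_histogram syntax used in this code and are placeholders here.
from elasticsearch_dsl import A, Search

def add_signature_aggs(search, facets_size=50):
    sig_bucket = A('terms', field='signature.full', size=facets_size)
    # Per-signature daily histogram, analogous to the '_histogram.date' sub-aggregation.
    sig_bucket.bucket('histogram_date', 'date_histogram', field='date', interval='1d')
    # Per-signature product breakdown, analogous to a plain terms sub-aggregation.
    sig_bucket.bucket('product', 'terms', field='product', size=facets_size)
    search.aggs.bucket('signature', sig_bucket)
    return search

# search = add_signature_aggs(Search(index='crash-reports'))
# search.to_dict() shows the 'signature' bucket with its two nested aggregations.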
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}
    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi = query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"])
        search = search.query(multi)

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=key)
            )
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)

    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            reverse = sort_by.startswith("-")
            if reverse:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if reverse:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    vocab_ids = set(get_vocab_ids(repo_slug=repo_slug))
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing", field=vocab_key
        )
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms", field=vocab_key
        )
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=key), "terms", field=key
        )

    return SearchResults(search)
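# Hedged example: a sketch of reading back the paired "missing"/"terms" facet
# buckets that search_index() registers per vocabulary. The index name and vocabulary
# key are placeholders, and a default connection is assumed to have been registered
# with elasticsearch_dsl.connections.create_connection().
from elasticsearch_dsl import Search

def vocab_facet_counts(vocab_key, index='haystack'):
    search = Search(index=index)
    search.aggs.bucket('{0}_missing'.format(vocab_key), 'missing', field=vocab_key)
    search.aggs.bucket('{0}_buckets'.format(vocab_key), 'terms', field=vocab_key)
    response = search.execute()
    aggs = response.aggregations
    # Documents with no term assigned for this vocabulary...
    missing = aggs['{0}_missing'.format(vocab_key)]['doc_count']
    # ...and a per-term document count for the rest.
    counts = {
        bucket['key']: bucket['doc_count']
        for bucket in aggs['{0}_buckets'.format(vocab_key)]['buckets']
    }
    return missing, counts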