def get_scan_generic_format(client, now, last_fetch_timestamp=None):
    """Gets a scan object in generic format"""
    es = client.es
    time_field = client.time_field
    fetch_index = client.fetch_index or '_all'

    # Without a time field we can only run the client's raw query.
    if not time_field:
        return Search(using=es, index=fetch_index).query(
            QueryString(query=client.query))

    # Bound the range below only after the first fetch has happened.
    if last_fetch_timestamp:
        range_field = {time_field: {'gt': last_fetch_timestamp, 'lte': now}}
    else:
        range_field = {time_field: {'lte': now}}
    return Search(using=es, index=fetch_index).filter(
        {'range': range_field}).query(QueryString(query=time_field + ':*'))
def make_query(query, filters, page, sort_by):
    """Run a paginated (20 hits/page) search against the configured index.

    @param query: query-string text; falsy means "match everything"
    @param filters: optional list of filter clauses ANDed via a bool filter
    @param page: 1-based page number
    @param sort_by: key into SORT_BY; defaulted per branch when falsy
    @returns: the executed response, or None on connection failure
    """
    try:
        client = Elasticsearch()
        s = Search(client, index=app.config['INDEX'])
        if query:
            s = s.query(QueryString(query=escape_query(query)))
            if not sort_by:
                sort_by = "relevance"
        else:
            s = s.query(MatchAll())
            if not sort_by:
                sort_by = DEFAULT_SORT_BY
        s = s.sort(SORT_BY.get(sort_by, DEFAULT_SORT_BY)['value'])
        start = (page - 1) * 20
        s = s[start:start + 20]
        if filters:
            s = s.filter('bool', must=filters)
        return s.execute()
    # BUG FIX: `except ConnectionError, ex:` is Python 2 syntax and is a
    # SyntaxError under Python 3; the bound exception was also unused.
    except ConnectionError:
        return None
def get_scan_insight_format(client, now, last_fetch_timestamp=None, feed_type=None):
    """Gets a scan object in insight format"""
    time_field = client.time_field
    if last_fetch_timestamp:
        range_field = {time_field: {'gt': last_fetch_timestamp, 'lte': now}}
    else:
        range_field = {time_field: {'lte': now}}

    indices = client.fetch_index
    if feed_type == FEED_TYPE_CORTEX_MT:
        indices = '*-shared*'
        tenant_hash = demisto.getIndexHash()
        if tenant_hash:
            # all shared indexes minus this tenant shared
            indices += f',-*{tenant_hash}*-shared*'
    elif not indices:
        indices = '_all'

    time_query = QueryString(query=time_field + ":*")
    return Search(using=client.es, index=indices).filter(
        {'range': range_field}).query(time_query)
def get_entities_by_sport_and_query(self, sport, query):
    """Return up to 5 entities for a sport, optionally filtered by a wildcard query.

    @param sport: Sport enum value selecting the index to search
    @param query: optional text; wrapped in wildcards and matched on name/abstract
    @returns: list of dicts with 'id', 'name', 'abstract', 'type' keys
    """
    search = Search(using=self.es)[0:5]
    if sport == Sport.SOCCER:
        search = search.index('soccer-entity')
    elif sport == Sport.BASKETBALL:
        search = search.index('basketball-entity')
    if query:
        search = search.query(
            QueryString(query='*{}*'.format(query),
                        fields=['name^5', 'abstract']))
    hits = []
    for hit in search.execute():
        doc_id = hit.meta['id']
        doc = hit.to_dict()
        entity = {
            'id': doc_id,
            'name': doc['name'],
            'abstract': doc.get('abstract', 'None'),
            # BUG FIX: when 'type' was missing the original wrote the fallback
            # into entity['abstract'] instead of entity['type'], leaving 'type'
            # unset and clobbering a valid abstract.
            'type': doc.get('type', 'None'),
        }
        hits.append(entity)
    return hits
def test_time_field_query(es):
    """Test executing query of fetch time field.

    Notes:
        if is_fetch is ticked, this function checks if the entered TIME_FIELD
        returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned.
    """
    time_query = QueryString(query=TIME_FIELD + ':*')
    response = Search(using=es, index=FETCH_INDEX).query(time_query)[0:1].execute().to_dict()
    _, total_results = get_total_results(response)
    if total_results == 0:
        # failed in getting the TIME_FIELD
        return_error(
            "Fetch incidents test failed.\nDate field value incorrect [{}].".
            format(TIME_FIELD))
    else:
        return response
def test_fetch_query(es):
    """Test executing fetch query.

    Notes:
        if is_fetch is ticked, this function checks if the FETCH_QUERY returns
        results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned.
    """
    fetch_query = QueryString(query=str(TIME_FIELD) + ":* AND " + FETCH_QUERY)
    response = Search(using=es, index=FETCH_INDEX).query(fetch_query)[0:1].execute().to_dict()
    _, total_results = get_total_results(response)
    if total_results > 0:
        return response
    # failed to get the TIME_FIELD with the FETCH_QUERY
    # this can happen and not be an error if the FETCH_QUERY doesn't have results yet.
    # Thus this does not return an error message
    return None
def fetch_incidents():
    """Fetch new incidents since the last run and push them to the platform."""
    last_fetch, last_fetch_timestamp = get_last_fetch_time()
    es = elasticsearch_builder()
    fetch_query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elastic search can use epoch timestamps (in milliseconds) as date
    # representation regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {TIME_FIELD: {'gt': last_fetch_timestamp}}})
    search = search.sort({TIME_FIELD: {'order': 'asc'}})[0:FETCH_SIZE].query(fetch_query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List
    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            incidents, last_fetch = results_to_incidents_timestamp(response, last_fetch)
            demisto.setLastRun({'time': last_fetch})
        else:
            incidents, last_fetch = results_to_incidents_datetime(response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})

    demisto.info('extract {} incidents'.format(len(incidents)))
    demisto.incidents(incidents)
def fetch_incidents(proxies):
    """Fetch new incidents (through the given proxies) and push them to the platform."""
    last_fetch = demisto.getLastRun().get('time')

    # handle first time fetch
    if last_fetch is None:
        last_fetch, _ = parse_date_range(date_range=FETCH_TIME,
                                         date_format='%Y-%m-%dT%H:%M:%S.%f',
                                         utc=False,
                                         to_timestamp=False)
        last_fetch = parse(str(last_fetch))
        last_fetch_timestamp = int(last_fetch.timestamp() * 1000)
        # if timestamp: get the last fetch to the correct format of timestamp
        if 'Timestamp' in TIME_METHOD:
            last_fetch = get_timestamp_first_fetch(last_fetch)
            last_fetch_timestamp = last_fetch
    # if method is simple date - convert the date string to datetime
    elif 'Simple-Date' == TIME_METHOD:
        last_fetch = parse(str(last_fetch))
        last_fetch_timestamp = int(last_fetch.timestamp() * 1000)
    # if last_fetch is set and we are in a "Timestamp" method - than the
    # last_fetch_timestamp is the last_fetch.
    else:
        last_fetch_timestamp = last_fetch

    es = elasticsearch_builder(proxies)
    fetch_query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elastic search can use epoch timestamps (in milliseconds) as date
    # representation regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {TIME_FIELD: {'gt': last_fetch_timestamp}}})
    search = search.sort({TIME_FIELD: {'order': 'asc'}})[0:FETCH_SIZE].query(fetch_query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List
    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            incidents, last_fetch = results_to_incidents_timestamp(response, last_fetch)
            demisto.setLastRun({'time': last_fetch})
        else:
            incidents, last_fetch = results_to_incidents_datetime(response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})

    demisto.info('extract {} incidents'.format(len(incidents)))
    demisto.incidents(incidents)
def test_general_query(es):
    """Run a match-everything query on FETCH_INDEX; report a fetch-test error if the index is missing."""
    try:
        search = Search(using=es, index=FETCH_INDEX).query(QueryString(query='*'))[0:1]
        response = search.execute().to_dict()
        _, total_results = get_total_results(response)
    except NotFoundError as e:
        # NOTE(review): the slicing below assumes a specific comma-separated
        # error-string layout — preserved as-is.
        return_error("Fetch incidents test failed.\nError message: {}.".format(
            str(e).split(',')[2][2:-1]))
def index():
    """Query Elasticsearch using Invenio query syntax."""
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 2, type=int)
    start = (page - 1) * size
    search = ExampleSearch()[start:start + size]
    if 'q' in request.values:
        search = search.query(QueryString(query=request.values.get('q')))
    search = search.sort(request.values.get('sort', 'title'))
    search = ExampleSearch.faceted_search(search=search)
    return jsonify(search.execute().to_dict())
def test_cernopendata_query_parser():
    """Dataset paths get quoted and on-demand records are excluded unless show_ondemand is set."""
    def expected(q):
        return Bool(must=[QueryString(query=q)],
                    must_not=[Match(distribution__availability__keyword='ondemand')])

    assert cernopendata_query_parser('/Btau') == expected('"/Btau"')
    assert cernopendata_query_parser('"/Btau"') == expected('"/Btau"')
    assert cernopendata_query_parser('/btau AND CMS') == expected('"/btau" AND CMS')
    assert cernopendata_query_parser('"/btau" AND CMS') == expected('"/btau" AND CMS')
    assert cernopendata_query_parser('CMS AND /btau') == expected('CMS AND "/btau"')
    assert cernopendata_query_parser('CMS AND /btau', show_ondemand='true') == \
        QueryString(query='CMS AND "/btau"')
def fetch_incidents():
    """Fetch new incidents using datetime-based bookkeeping and push them to the platform."""
    last_fetch = demisto.getLastRun().get('time')

    # handle first time fetch
    if last_fetch is None:
        last_fetch, _ = parse_date_range(date_range=FETCH_TIME,
                                         date_format=TIME_FORMAT,
                                         utc=False,
                                         to_timestamp=False)
        last_fetch = datetime.strptime(str(last_fetch), TIME_FORMAT)
        # if timestamp: get the last fetch to the correct format of timestamp
        if 'Timestamp' in TIME_METHOD:
            last_fetch = get_timestamp_first_fetch(last_fetch)
    # if method is simple date - convert the date string to datetime
    elif 'Simple-Date' == TIME_METHOD:
        last_fetch = datetime.strptime(last_fetch, TIME_FORMAT)

    es = elasticsearch_builder()
    fetch_query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {TIME_FIELD: {'gt': last_fetch}}})
    search = search.sort({TIME_FIELD: {'order': 'asc'}})[0:FETCH_SIZE].query(fetch_query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List
    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            incidents, last_fetch = results_to_incidents_timestamp(response, last_fetch)
            demisto.setLastRun({'time': last_fetch})
        else:
            incidents, last_fetch = results_to_incidents_datetime(response, last_fetch)
            demisto.setLastRun({'time': datetime.strftime(last_fetch, TIME_FORMAT)})

    demisto.info('extract {} incidents'.format(len(incidents)))
    demisto.incidents(incidents)
def test_fetch_query(es):
    """Check that FETCH_QUERY combined with a TIME_FIELD existence test returns hits."""
    fetch_query = QueryString(query=str(TIME_FIELD) + ":* AND " + FETCH_QUERY)
    response = Search(using=es, index=FETCH_INDEX).query(fetch_query)[0:1].execute().to_dict()
    _, total_results = get_total_results(response)
    if total_results > 0:
        return response
    # failed to get the TIME_FIELD with the FETCH_QUERY
    # this can happen and not be an error if the FETCH_QUERY doesn't have results yet.
    # Thus this does not return an error message
    return None
def test_time_field_query(es):
    """Check that documents in FETCH_INDEX carry the configured TIME_FIELD."""
    time_query = QueryString(query=TIME_FIELD + ':*')
    response = Search(using=es, index=FETCH_INDEX).query(time_query)[0:1].execute().to_dict()
    _, total_results = get_total_results(response)
    if total_results == 0:
        # failed in getting the TIME_FIELD
        return_error(
            "Fetch incidents test failed.\nDate field value incorrect [{}].".
            format(TIME_FIELD))
    else:
        return response
def collection_records(collection=None):
    """Serialize all records belonging to a collection's drilldown tree.

    @param collection: collection name to look up
    @returns: the serialized search results
    """
    collections = Collection.query.filter(
        Collection.name.in_([collection])).one().drilldown_tree()
    query_string = ' or '.join(get_collections_queries(collections))
    search = RecordsSearch().params(version=True).query(
        QueryString(query=query_string))
    response = search.execute().to_dict()
    # BUG FIX: removed the dead local `records = {'records': recs}` — it was
    # built and then discarded; the function has always returned the bare
    # serialized results.
    return json_v1.serialize_search(cap_record_fetcher, response)
def search_command(proxies):
    """Performs a search in Elasticsearch."""
    args = demisto.args()
    index = args.get('index')
    query = args.get('query')
    fields = args.get('fields')  # fields to display
    explain = 'true' == args.get('explain')
    base_page = int(args.get('page'))
    size = int(args.get('size'))
    sort_field = args.get('sort-field')
    sort_order = args.get('sort-order')

    es = elasticsearch_builder(proxies)
    search = Search(using=es, index=index).query(
        QueryString(query=query))[base_page:base_page + size]
    if explain:
        # if 'explain parameter is set to 'true' - adds explanation section to search results
        search = search.extra(explain=True)
    if fields is not None:
        search = search.source(fields.split(','))
    if sort_field is not None:
        search = search.sort({sort_field: {'order': sort_order}})

    response = search.execute().to_dict()
    total_dict, total_results = get_total_results(response)
    search_context, meta_headers, hit_tables, hit_headers = results_to_context(
        index, query, base_page, size, total_dict, response)

    search_human_readable = tableToMarkdown('Search Metadata:',
                                            search_context,
                                            meta_headers,
                                            removeNull=True)
    hits_human_readable = tableToMarkdown('Hits:',
                                          hit_tables,
                                          hit_headers,
                                          removeNull=True)
    total_human_readable = search_human_readable + '\n' + hits_human_readable
    full_context = {
        'Elasticsearch.Search(val.Query == obj.Query && val.Index == obj.Index '
        '&& val.Server == obj.Server && val.Page == obj.Page && val.Size == obj.Size)':
        search_context
    }
    return_outputs(total_human_readable, full_context, response)
def test_general_query(es):
    """Test executing query to all available indexes.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.
    """
    try:
        search = Search(using=es, index='*').query(QueryString(query='*'))[0:1]
        get_total_results(search.execute().to_dict())
    except NotFoundError as e:
        return_error(
            "Failed executing general search command - please check the Server URL and port number "
            "and the supplied credentials.\nError message: {}.".format(str(e)))
def test_general_query(es):
    """Run a general query against FETCH_INDEX to confirm it responds.

    Notes:
        if is_fetch is ticked, this runs a generic query just to make sure we
        get a response from the FETCH_INDEX.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.
    """
    try:
        response = Search(using=es, index=FETCH_INDEX).query(
            QueryString(query='*'))[0:1].execute().to_dict()
        _, total_results = get_total_results(response)
    except NotFoundError as e:
        return_error("Fetch incidents test failed.\nError message: {}.".format(str(e).split(',')[2][2:-1]))
def get(self, request):
    """Drug search endpoint: wildcard-match ?q across drug document fields."""
    raw = request.GET.get('q', None)
    if not raw:
        return Response("Search query not set",
                        status=status.HTTP_400_BAD_REQUEST)
    # NOTE(review): [:-1] drops the query's last character — presumably a
    # trailing wildcard/space appended by the client; confirm with the frontend.
    wildcard = f'*{raw.strip()[:-1]}*'
    search_fields = ['name', 'base_name', 'generic_names', 'active_ingredients']
    drugs = DrugDocument.search().query(
        QueryString(query=wildcard, fields=search_fields))
    serializer = DrugDocumentSerializer(drugs, many=True)
    return Response(serializer.data, status=status.HTTP_200_OK)
def get(self, request):
    """Doctor search endpoint: prefix-match ?q across doctor document fields."""
    raw = request.GET.get('q', None)
    if not raw:
        return Response("Search query not set",
                        status=status.HTTP_400_BAD_REQUEST)
    # NOTE(review): [:-1] drops the query's last character — presumably a
    # trailing wildcard/space appended by the client; confirm with the frontend.
    wildcard = f'{raw.strip()[:-1]}*'
    search_fields = [
        'name', 'specializations', 'degrees', 'associations', 'fellowships',
        'diplomates', 'insurance_providers', 'medical_institutions', 'addresses'
    ]
    doctors = DoctorDocument.search().query(
        QueryString(query=wildcard, fields=search_fields))
    serializer = DoctorDocumentSerializer(doctors, many=True)
    return Response(serializer.data, status=status.HTTP_200_OK)
def get_indicators_search_scan():
    """Build a scan over shared tenant indexes for indicators updated since the last run.

    Returns a (Search, now-timestamp-string) pair.
    """
    now = datetime.now()
    time_field = "calculatedTime"
    last_fetch = demisto.getLastRun().get('time')
    if last_fetch:
        range_field = {
            time_field: {
                'gt': datetime.fromtimestamp(float(last_fetch)),
                'lte': now,
            }
        }
    else:
        range_field = {time_field: {'lte': now}}

    es = elasticsearch_builder()
    tenant_hash = demisto.getIndexHash()
    # all shared indexes minus this tenant shared
    indexes = f'*-shared*,-*{tenant_hash}*-shared*'
    search = Search(using=es, index=indexes).filter(
        {'range': range_field}).query(QueryString(query=time_field + ":*"))
    return search, str(now.timestamp())
def search(query, index=None, filters=list(), size=10, include="*", exclude="authors",
           offset=0, sort_field=None, sort_order='', post_filter=None):
    """
    Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
        Currently supported: ('author', author_fullname),
        ('collaboration', collaboration_name), ('date', date)
    :param size: [int] max number of hits that should be returned
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
        "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
        (for a particular field) or reversed. Supported: '' or 'rev'
    :param post_filter: optional post-filter applied after aggregations
    :return: [dict] dictionary with processed results and facets
    """
    # NOTE(review): `filters=list()` is a mutable default argument — safe only
    # as long as no callee mutates it; consider `filters=None`.
    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'
    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()
    if query:
        # Fuzzy match on the publication itself, its nested authors, and its
        # child datatables; assigned directly (not chained) so the three
        # clauses are OR-ed as one top-level query.
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
            Q('nested', query=fuzzy_query, path='authors') | \
            Q('has_child', type="child_datatable", query=fuzzy_query)
    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)
    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({mapped_sort_field: {"order": calculate_sort_order(sort_order, sort_field)}})
    search = add_default_aggregations(search, filters)
    if post_filter:
        search = search.post_filter(post_filter)
    search = search.source(includes=include, excludes=exclude)
    # Pagination window for the publication-level results.
    search = search[offset:offset + size]
    pub_result = search.execute().to_dict()
    # Restrict the data-table search to children of the publications just found.
    parent_filter = {
        "terms": {
            "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
        }
    }
    data_search = RecordsSearch(using=es, index=index)
    data_search = data_search.query('has_parent',
                                    parent_type="parent_publication",
                                    query=parent_filter)
    if query:
        data_search = data_search.query(QueryString(query=query))
    # Fetch up to 50 data tables per publication in the page.
    data_search = data_search[0:size * 50]
    data_result = data_search.execute().to_dict()
    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results, filters)
#!/usr/bin/env python
"""Scan the 'fnhttp' index for pages mentioning forms and collect their input field names."""
import sys
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import QueryString
from elasticsearch import helpers, Elasticsearch
from bs4 import BeautifulSoup
from celery import group
import bipolar
from time import sleep
import json

query = sys.argv[1]
es = Elasticsearch()
s = Search(using=es, index="fnhttp")
qs = QueryString(query="form action")
s = s.query(qs)
response = s.scan()
final_urls = []
for hit in response:
    url = hit.url
    soup = BeautifulSoup(hit.content, 'html.parser')
    inputs = soup.find_all('input')
    field_names = []
    for i in inputs:
        # BUG FIX: Tag.has_key() was removed from BeautifulSoup 4;
        # has_attr() is the supported equivalent.
        if i.has_attr('name'):
            field_names.append(i['name'])
    if field_names == []:
        pass
def get_collection_manifest(cls, api_root, **query_parameters):
    """Return the manifest entries of a collection, filtered, versioned and paginated.

    @param api_root: feed root; used to derive the objects/manifest index names
    @param query_parameters: collection_id, limit, added_after, types, ids,
        versions, spec_versions, next
    @returns: dict with an 'objects' list, or an EXCEPTIONS entry on failure
    """
    # BUG FIX: removed the duplicated objects_query/manifest_query/
    # version_range/added_after_range initializations, the unused
    # version_range/next_id locals, and the dead `if query_parameters is None`
    # check (**kwargs is always a dict, and it was checked only after use).
    size = int(query_parameters.get('limit'))
    max_page_size = PAGE_SIZE
    added_after = query_parameters.get('added_after')
    sort_by = {'date_added': {'order': 'asc'}}
    types = query_parameters.get('types')
    ids = query_parameters.get('ids')
    versions = query_parameters.get('versions')
    spec_versions = query_parameters.get('spec_versions')
    base_page = 0
    added_after_range = None
    log_debug(
        f"Request to Get The objects Manifest of Collection: {query_parameters.get('collection_id')} "
        f"in the Feed Root: {api_root}")
    try:
        # Create a Query to filter Objects by collection id, types and spec_versions
        objects_query = f"collection : {query_parameters.get('collection_id')}"
        if types:
            types = types.replace(",", " OR ")
            objects_query = objects_query + f" AND type : ('{types}')"
        if spec_versions:
            spec_versions = spec_versions.replace(",", " OR ")
            objects_query = objects_query + f" AND spec_version : ('{spec_versions}')"
        objects_query_string = QueryString(query=objects_query,
                                           default_operator="and")

        # Create a Query to filter Manifest by collection id, object id's,
        # versions and added after dates
        manifest_query = f"collection : {query_parameters.get('collection_id')}"
        if ids:
            ids = ids.replace(",", " OR ")
            manifest_query = manifest_query + f" AND id : ('{ids}')"
        if added_after:
            added_after_range = Range(**{'date_added': {'gt': f'{added_after}'}})
        manifests_query_string = QueryString(query=manifest_query,
                                             default_operator="and")

        # Get the intersect of both Objects and Manifest Queries
        intersected_results = cls.es_client.manifest_intersect(
            intersect_by='id',
            objects_index=f'{api_root}-objects',
            objects_query_string=objects_query_string,
            manifests_index=f'{api_root}-manifest',
            manifests_query_string=manifests_query_string,
            added_after_range=added_after_range)

        # Version and Paginate The Results
        if intersected_results:
            manifest_ids = ",".join(intersected_results).replace(',', ' OR ')
            query_string = QueryString(query=f"id:('{manifest_ids}')",
                                       default_operator="AND")
            pre_versioning_results = cls.es_client.scan(
                index=f'{api_root}-manifest', query_string=query_string)
            pre_pagination_results = Helper.fetch_objects_by_versions(
                stix_objects=pre_versioning_results, versions=versions)
            if -1 < size < max_page_size:
                results = cls.es_client.search(index=f'{api_root}-manifest',
                                               query_string=query_string,
                                               search_from=base_page,
                                               size=size,
                                               sort_by=sort_by)
            else:
                results = cls.es_client.search(index=f'{api_root}-manifest',
                                               query_string=query_string,
                                               search_from=base_page,
                                               size=max_page_size,
                                               sort_by=sort_by)
            # NOTE(review): the paginated `results` fetched above are
            # immediately overwritten with the full versioned set, so the
            # search-based pagination has no effect — confirm intent.
            results = {'objects': pre_pagination_results}
        else:
            results = {"objects": []}
        return results
    except Exception as e:
        log_error(e)
        if query_parameters.get('next'):
            return EXCEPTIONS.get('NextNotFoundException', {})
        else:
            return EXCEPTIONS.get('CollectionNotFoundException', {})
def prepare(self, params=None, params_whitelist=SEARCH_PARAM_WHITELIST,
            search_models=SEARCH_MODELS, fields=SEARCH_INCLUDE_FIELDS,
            fields_nested=SEARCH_NESTED_FIELDS, fields_agg=SEARCH_AGG_FIELDS):
    """Assemble elasticsearch_dsl.Search object

    @param params: dict (default changed from mutable `{}` to None — same
        falsy behavior, avoids the shared-mutable-default pitfall)
    @param params_whitelist: list Accept only these (SEARCH_PARAM_WHITELIST)
    @param search_models: list Limit to these ES doctypes (SEARCH_MODELS)
    @param fields: list Retrieve these fields (SEARCH_INCLUDE_FIELDS)
    @param fields_nested: list See SEARCH_NESTED_FIELDS
    @param fields_agg: dict See SEARCH_AGG_FIELDS
    @returns: None; stores the assembled Search on self.s
    """
    # gather inputs ------------------------------
    # self.params is a copy of the params arg as it was passed
    # to the method. It is used for informational purposes
    # and is passed to SearchResults.
    # Sanitize while copying.
    if params:
        self.params = {
            key: sanitize_input(val) for key, val in params.items()
        }
    params = deepcopy(self.params)
    # scrub fields not in whitelist
    bad_fields = [
        key for key in params.keys()
        if key not in params_whitelist + ['page']
    ]
    for key in bad_fields:
        params.pop(key)
    indices = search_models
    if params.get('models'):
        # BUG FIX: the comprehension referenced an undefined name `models`
        # (NameError whenever a 'models' param was supplied); it must iterate
        # the supplied params['models'].
        indices = ','.join(
            [DOCSTORE.index_name(model) for model in params['models']])
    # field-specific searches embedded in fulltext
    if params.get('fulltext') and 'creators:' in params['fulltext']:
        params['creators'] = params.pop('fulltext').replace('creators:', '')
    if params.get('fulltext') and 'persons:' in params['fulltext']:
        params['persons'] = params.pop('fulltext').replace('persons:', '')
    s = Search(using=self.conn, index=indices)
    # only return specified fields
    s = s.source(fields)
    # sorting
    if params.get('sort'):
        args = params.pop('sort')
        s = s.sort(*args)
    if params.get('match_all'):
        s = s.query('match_all')
    elif params.get('fulltext'):
        fulltext = params.pop('fulltext')
        # MultiMatch chokes on lists
        if isinstance(fulltext, list) and (len(fulltext) == 1):
            fulltext = fulltext[0]
        # fulltext search
        s = s.query(
            QueryString(
                query=fulltext,
                fields=fields,
                analyze_wildcard=False,
                allow_leading_wildcard=False,
                default_operator='AND',
            ))
    elif params.get('creators'):
        q = Q('bool',
              must=[
                  Q('nested',
                    path='creators',
                    query=Q('term',
                            creators__namepart=params.pop('creators')))
              ])
        s = s.query(q)
    elif params.get('topics') or params.get('facility'):
        # SPECIAL CASE FOR DDRPUBLIC TOPICS, FACILITY BROWSE PAGES
        if params.get('topics'):
            q = Q('bool',
                  must=[
                      Q('nested',
                        path='topics',
                        query=Q('term', topics__id=params.pop('topics')))
                  ])
            s = s.query(q)
        elif params.get('facility'):
            q = Q('bool',
                  must=[
                      Q('nested',
                        path='facility',
                        query=Q('term', facility__id=params.pop('facility')))
                  ])
            s = s.query(q)
    if params.get('parent'):
        parent = params.pop('parent')
        if isinstance(parent, list) and (len(parent) == 1):
            parent = parent[0]
        if parent:
            parent = '%s-*' % parent
        s = s.query("wildcard", id=parent)
    # filters
    for key, val in params.items():
        if key in fields_nested:
            # Instead of nested search on topics.id or facility.id
            # search on denormalized topics_id or facility_id fields.
            fieldname = '%s_id' % key
            s = s.filter('term', **{fieldname: val})
        elif (key in params_whitelist) and val:
            # 'term' search is for single choice, not multiple choice fields(?)
            s = s.filter('term', **{key: val})
    # aggregations
    for fieldname, field in fields_agg.items():
        # nested aggregation (Elastic docs: https://goo.gl/xM8fPr)
        if fieldname == 'topics':
            s.aggs.bucket('topics', 'nested', path='topics') \
                .bucket('topics_ids', 'terms', field='topics.id', size=1000)
        elif fieldname == 'facility':
            s.aggs.bucket('facility', 'nested', path='facility') \
                .bucket('facility_ids', 'terms', field='facility.id', size=1000)
        # result:
        # results.aggregations['topics']['topic_ids']['buckets']
        #   {u'key': u'69', u'doc_count': 9}
        # simple aggregations
        else:
            s.aggs.bucket(fieldname, 'terms', field=field)
    self.s = s
def search(query, index=None, filters=list(), size=10, include="*", exclude="authors",
           offset=0, sort_field=None, sort_order='', post_filter=None):
    """
    Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
        Currently supported: ('author', author_fullname),
        ('collaboration', collaboration_name), ('date', date)
    :param size: [int] max number of hits that should be returned
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
        "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
        (for a particular field) or reversed. Supported: '' or 'rev'
    :param post_filter: optional post-filter applied after aggregations
    :return: [dict] dictionary with processed results and facets, or
        {'error': reason} on a transport failure
    """
    # NOTE(review): `filters=list()` is a mutable default argument — safe only
    # as long as no callee mutates it; consider `filters=None`.
    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'
    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()
    if query:
        # Fuzzy match on the publication itself, its nested authors, and its
        # child datatables; assigned directly (not chained) so the three
        # clauses are OR-ed as one top-level query.
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
            Q('nested', query=fuzzy_query, path='authors') | \
            Q('has_child', type="child_datatable", query=fuzzy_query)
    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)
    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    search = add_default_aggregations(search, filters)
    if post_filter:
        search = search.post_filter(post_filter)
    search = search.source(includes=include, excludes=exclude)
    # Pagination window for the publication-level results.
    search = search[offset:offset + size]
    try:
        pub_result = search.execute().to_dict()
        # Restrict the data-table search to children of the publications found.
        parent_filter = {
            "terms": {
                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
            }
        }
        data_search = RecordsSearch(using=es, index=index)
        data_search = data_search.query('has_parent',
                                        parent_type="parent_publication",
                                        query=parent_filter)
        if query:
            data_search = data_search.query(QueryString(query=query))
        # Scale the data-table window so it never exceeds the ES result window.
        data_search_size = size * ELASTICSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
        data_search = data_search[0:data_search_size]
        data_result = data_search.execute().to_dict()
        merged_results = merge_results(pub_result, data_result)
        return map_result(merged_results, filters)
    except TransportError as e:
        # For search phase execution exceptions we pass the reason as it's
        # likely to be user error (e.g. invalid search query)
        if e.error == 'search_phase_execution_exception' and e.info \
                and "error" in e.info and isinstance(e.info['error'], dict):
            reason = e.info['error']['root_cause'][0]['reason']
        # Otherwise we hide the details from the user
        else:
            log.error(f'An unexpected error occurred when searching: {e}')
            reason = f'An unexpected error occurred: {e.error}'
        return {'error': reason}