def get_es_db_missing_pids(self, doc_type, with_deleted=False):
    """Compare the pids found in Elasticsearch with those from the database.

    The search index is read from the ``RECORDS_REST_ENDPOINTS`` application
    configuration entry for ``doc_type``. Only Elasticsearch documents whose
    ``_created`` is at least ``self.time_delta`` minutes old are scanned; the
    same cut-off date is handed to ``self.get_all_pids``.

    :param doc_type: key of the endpoint in ``RECORDS_REST_ENDPOINTS``.
    :param with_deleted: forwarded unchanged to ``self.get_all_pids``.
    :return: tuple ``(pids_es, pids_db, pids_es_double, index)``:

        * ``pids_es`` - pids seen in the Elasticsearch scan that were not
          yielded by ``self.get_all_pids``;
        * ``pids_db`` - pids yielded by ``self.get_all_pids`` that had no
          (remaining) match among the Elasticsearch pids;
        * ``pids_es_double`` - pids returned more than once by the scan;
        * ``index`` - the configured search index, or ``None``.

        The three lists are empty when no index is configured or when
        ``doc_type`` is listed in ``self.has_no_db``.
    """
    rest_endpoints = current_app.config.get('RECORDS_REST_ENDPOINTS')
    index = rest_endpoints.get(doc_type, {}).get('search_index')

    # Nothing to compare without an index or for types that have no DB side.
    if not index or doc_type in self.has_no_db:
        return [], [], [], index

    cutoff = datetime.utcnow() - timedelta(minutes=self.time_delta)

    # A dict (not a set) keeps the scan order for the returned list.
    only_in_es = {}
    duplicated_in_es = []
    es_search = RecordsSearch(index=index).filter(
        'range', _created={'lte': cutoff}
    )
    for hit in es_search.source('pid').scan():
        es_pid = hit.pid
        if es_pid in only_in_es:
            duplicated_in_es.append(es_pid)
        only_in_es[es_pid] = 1

    only_in_db = []
    db_pids = self.get_all_pids(
        doc_type,
        with_deleted=with_deleted,
        date=cutoff
    )
    for db_pid in db_pids:
        if db_pid in only_in_es:
            # Matched on both sides: consume the Elasticsearch entry so that
            # whatever is left afterwards exists in Elasticsearch only.
            del only_in_es[db_pid]
        else:
            only_in_db.append(db_pid)

    return list(only_in_es), only_in_db, duplicated_in_es, index
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """
    Perform a search query.

    Two Elasticsearch requests are made: one for the publications matching
    the query, and one for the child documents whose parent is one of the
    publications on the returned page. Both results are merged and mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index, passed to ``RecordsSearch``
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    ``None`` (the default) means no filters.
    :param size: [int] max number of hits that should be returned
    :param include: source fields to include in the publication hits
    :param exclude: source fields to exclude from the publication hits
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                       "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                       (for a particular field) or reversed. Supported:
                       '' or 'rev'
    :param post_filter: optional filter applied through
                        ``Search.post_filter`` when truthy
    :return: [dict] dictionary with processed results and facets, as
             produced by ``map_result``
    """
    # Use a fresh list per call: a mutable default would be one shared
    # object handed to every helper below on every call.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)

    # Create search with preference param to ensure consistency of results
    # across shards
    pub_search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        # Match on the publication itself, on its nested authors, or on any
        # of its child data tables.
        pub_search.query = fuzzy_query | \
            Q('nested', query=fuzzy_query, path='authors') | \
            Q('has_child', type="child_datatable", query=fuzzy_query)

    pub_search = pub_search.filter("term", doc_type=CFG_PUB_TYPE)
    pub_search = QueryBuilder.add_filters(pub_search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    pub_search = pub_search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    pub_search = add_default_aggregations(pub_search, filters)

    if post_filter:
        pub_search = pub_search.post_filter(post_filter)

    pub_search = pub_search.source(includes=include, excludes=exclude)
    pub_search = pub_search[offset:offset + size]
    pub_result = pub_search.execute().to_dict()

    # Restrict the second query to children of the publications on this page.
    parent_filter = {
        "terms": {
            "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
        }
    }

    data_search = RecordsSearch(using=es, index=index)
    data_search = data_search.query('has_parent',
                                    parent_type="parent_publication",
                                    query=parent_filter)
    if query:
        data_search = data_search.query(QueryString(query=query))

    # Cap the child-document query at 50 times the requested page size.
    data_search = data_search[0:size * 50]
    data_result = data_search.execute().to_dict()

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results, filters)
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """
    Perform a search query.

    Two Elasticsearch requests are made: one for the publications matching
    the query, and one for the child documents whose parent is one of the
    publications on the returned page. Both results are merged and mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index, passed to ``RecordsSearch``
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    ``None`` (the default) means no filters.
    :param size: [int] max number of hits that should be returned
    :param include: source fields to include in the publication hits
    :param exclude: source fields to exclude from the publication hits
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                       "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                       (for a particular field) or reversed. Supported:
                       '' or 'rev'
    :param post_filter: optional filter applied through
                        ``Search.post_filter`` when truthy
    :return: [dict] dictionary with processed results and facets, as
             produced by ``map_result``; or ``{'error': reason}`` when
             executing either request raises ``TransportError``
    """
    # Use a fresh list per call: a mutable default would be one shared
    # object handed to every helper below on every call.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)

    # Create search with preference param to ensure consistency of results
    # across shards
    pub_search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        # Match on the publication itself, on its nested authors, or on any
        # of its child data tables.
        pub_search.query = fuzzy_query | \
            Q('nested', query=fuzzy_query, path='authors') | \
            Q('has_child', type="child_datatable", query=fuzzy_query)

    pub_search = pub_search.filter("term", doc_type=CFG_PUB_TYPE)
    pub_search = QueryBuilder.add_filters(pub_search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    pub_search = pub_search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    pub_search = add_default_aggregations(pub_search, filters)

    if post_filter:
        pub_search = pub_search.post_filter(post_filter)

    pub_search = pub_search.source(includes=include, excludes=exclude)
    pub_search = pub_search[offset:offset + size]

    try:
        pub_result = pub_search.execute().to_dict()

        # Restrict the second query to children of the publications on this
        # page.
        parent_filter = {
            "terms": {
                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
            }
        }

        data_search = RecordsSearch(using=es, index=index)
        data_search = data_search.query('has_parent',
                                        parent_type="parent_publication",
                                        query=parent_filter)
        if query:
            data_search = data_search.query(QueryString(query=query))

        # Scale the child-document window with the requested page size.
        data_search_size = (
            size * ELASTICSEARCH_MAX_RESULT_WINDOW
            // LIMIT_MAX_RESULTS_PER_PAGE
        )
        data_search = data_search[0:data_search_size]
        data_result = data_search.execute().to_dict()

        merged_results = merge_results(pub_result, data_result)
        return map_result(merged_results, filters)
    except TransportError as e:
        reason = None
        # For search phase execution exceptions we pass the reason as it's
        # likely to be user error (e.g. invalid search query)
        if e.error == 'search_phase_execution_exception':
            info = e.info
            error_body = info.get('error') if isinstance(info, dict) else None
            if isinstance(error_body, dict):
                # Look the reason up defensively so that an unexpected error
                # payload cannot raise a second exception from this handler.
                root_causes = error_body.get('root_cause')
                first_cause = (
                    root_causes[0]
                    if isinstance(root_causes, list) and root_causes
                    else None
                )
                if isinstance(first_cause, dict):
                    reason = first_cause.get('reason')

        # Otherwise we hide the details from the user
        if reason is None:
            log.error(f'An unexpected error occurred when searching: {e}')
            reason = f'An unexpected error occurred: {e.error}'

        return {'error': reason}