Example #1
0
 def get_es_db_missing_pids(self, doc_type, with_deleted=False):
     """Find pids that are present on only one side of ES and the DB.

     The ES index configured for ``doc_type`` is scanned for documents
     whose ``_created`` is at most ``self.time_delta`` minutes before now,
     and the result is compared with ``self.get_all_pids`` called with the
     same cut-off date.

     :param doc_type: key into the ``RECORDS_REST_ENDPOINTS`` config.
     :param with_deleted: forwarded unchanged to ``self.get_all_pids``.
     :returns: tuple ``(pids_es, pids_db, pids_es_double, index)`` where
         ``pids_es`` are pids found in ES but not returned by the DB,
         ``pids_db`` are pids returned by the DB but not found in ES,
         ``pids_es_double`` are pids seen more than once in ES and
         ``index`` is the configured search index (``None`` if there is
         none). The three lists are empty when no index is configured or
         ``doc_type`` is listed in ``self.has_no_db``.
     """
     # Tolerate a missing config entry the same way a missing doc_type is
     # tolerated, instead of failing with AttributeError on ``None``.
     endpoints = current_app.config.get('RECORDS_REST_ENDPOINTS') or {}
     index = endpoints.get(doc_type, {}).get('search_index')
     pids_es_double = []
     if not index or doc_type in self.has_no_db:
         return [], [], pids_es_double, index

     date = datetime.utcnow() - timedelta(minutes=self.time_delta)
     # A dict is used as an insertion-ordered set so the returned ES-only
     # pids keep the order in which the scan produced them.
     es_only = {}
     es_query = RecordsSearch(index=index) \
         .filter('range', _created={'lte': date})
     for hit in es_query.source('pid').scan():
         if hit.pid in es_only:
             pids_es_double.append(hit.pid)
         es_only[hit.pid] = True

     pids_db = []
     for pid in self.get_all_pids(
         doc_type,
         with_deleted=with_deleted,
         date=date
     ):
         if pid in es_only:
             # Present on both sides: not missing anywhere.
             del es_only[pid]
         else:
             pids_db.append(pid)
     # Whatever is left was never matched by a DB pid.
     return list(es_only), pids_db, pids_es_double, index
Example #2
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    Two searches are executed: one for publications matching the query and
    filters, and one for documents whose parent is among the returned
    publications. Both results are merged before being mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    If None, no filters are applied
    :param size: [int] max number of hits that should be returned
    :param include: source fields to include in the publication hits
    :param exclude: source fields to exclude from the publication hits
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: optional filter applied as a post filter to the
                    publication search

    :return: [dict] dictionary with processed results and facets
    """
    # Use a fresh list per call: a shared mutable default would leak state
    # between calls if any helper below modified it.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        # Match on the publication itself, its nested authors, or any of
        # its child data tables.
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset + size]
    pub_result = search.execute().to_dict()

    # Restrict the second search to children of the publications just found.
    parent_filter = {
        "terms": {
            "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
        }
    }

    data_search = RecordsSearch(using=es, index=index)
    data_search = data_search.query('has_parent',
                                    parent_type="parent_publication",
                                    query=parent_filter)
    if query:
        data_search = data_search.query(QueryString(query=query))

    # NOTE(review): the factor 50 presumably budgets child hits per
    # requested publication - confirm against the index settings.
    data_search = data_search[0:size * 50]
    data_result = data_search.execute().to_dict()

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results, filters)
Example #3
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    Two searches are executed: one for publications matching the query and
    filters, and one for documents whose parent is among the returned
    publications. Both results are merged before being mapped.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    If None, no filters are applied
    :param size: [int] max number of hits that should be returned
    :param include: source fields to include in the publication hits
    :param exclude: source fields to exclude from the publication hits
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: optional filter applied as a post filter to the
                    publication search

    :return: [dict] dictionary with processed results and facets, or
             ``{'error': reason}`` if a TransportError was raised while
             executing either search
    """
    # Use a fresh list per call: a shared mutable default would leak state
    # between calls if any helper below modified it.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        # Match on the publication itself, its nested authors, or any of
        # its child data tables.
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset + size]

    try:
        pub_result = search.execute().to_dict()

        # Restrict the second search to children of the publications just
        # found.
        parent_filter = {
            "terms": {
                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
            }
        }

        data_search = RecordsSearch(using=es, index=index)
        data_search = data_search.query('has_parent',
                                        parent_type="parent_publication",
                                        query=parent_filter)
        if query:
            data_search = data_search.query(QueryString(query=query))

        # Scale the child window with the page size so that a page of
        # LIMIT_MAX_RESULTS_PER_PAGE publications requests exactly
        # ELASTICSEARCH_MAX_RESULT_WINDOW child hits.
        data_search_size = size * ELASTICSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
        data_search = data_search[0:data_search_size]
        data_result = data_search.execute().to_dict()

        merged_results = merge_results(pub_result, data_result)
        return map_result(merged_results, filters)
    except TransportError as e:
        reason = None
        # ``info`` is not guaranteed to be a dict, so never index it blindly.
        info = e.info
        error_info = info.get('error') if isinstance(info, dict) else None

        # For search phase execution exceptions we pass the reason as it's
        # likely to be user error (e.g. invalid search query)
        if e.error == 'search_phase_execution_exception' \
                and isinstance(error_info, dict):
            root_causes = error_info.get('root_cause')
            if isinstance(root_causes, list) and root_causes \
                    and isinstance(root_causes[0], dict):
                reason = root_causes[0].get('reason')

        # Otherwise (or when no root-cause reason is available) we hide
        # the details from the user
        if reason is None:
            log.error(f'An unexpected error occurred when searching: {e}')
            reason = f'An unexpected error occurred: {e.error}'
        return {'error': reason}