Exemple #1
0
    def test_simple_query(self, mock_Elasticsearch, mock_Search):
        """Check that :func:`.index.search` handles a :class:`SimpleQuery`."""
        # One scored hit, inside a response whose item lookup always yields
        # ``{'total': 53}``.
        hit = mock.MagicMock()
        hit.meta.score = 1
        response = mock.MagicMock()
        response.__getitem__.return_value = {'total': 53}
        response.__iter__.return_value = [hit]

        # py-ES uses a chaining API: instantiating the search, each chained
        # call, and slicing all hand back the very same mock.
        mock_Search.return_value = mock_Search
        for chained in ('filter', 'highlight', 'highlight_options', 'query',
                        'sort'):
            getattr(mock_Search, chained).return_value = mock_Search
        mock_Search.__getitem__.return_value = mock_Search
        mock_Search.execute.return_value = response

        simple_query = SimpleQuery(order='relevance', page_size=10,
                                   search_field='title', value='foo title')
        document_set = index.search(simple_query)

        self.assertIsInstance(document_set, DocumentSet)
        expected_metadata = (
            ('start', 0),
            ('total', 53),
            ('current_page', 1),
            ('total_pages', 6),
            ('page_size', 10),
        )
        for key, expected in expected_metadata:
            self.assertEqual(document_set.metadata[key], expected)
        self.assertEqual(len(document_set.results), 1)
Exemple #2
0
def search(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
    """
    Build an :class:`APIQuery` from request parameters and execute it.

    Parameters
    ----------
    params : :class:`MultiDict`
        GET query parameters from the request.

    Returns
    -------
    dict
        Response data (to serialize): the document set under ``results`` and
        the executed query under ``query``.
    int
        HTTP status code.
    dict
        Extra headers for the response.
    """
    # This list is handed to every parsing helper below and finally attached
    # to the result metadata under ``query``. Presumably the helpers record
    # what they parsed in it -- TODO confirm against the helpers.
    parsed_terms: List[Dict[str, Any]] = []
    api_query = APIQuery()

    fielded = _get_fielded_terms(params, parsed_terms)
    if fielded is not None:
        api_query.terms = fielded

    dates = _get_date_params(params, parsed_terms)
    if dates is not None:
        api_query.date_range = dates

    primary_param = params.get('primary_classification')
    if primary_param:
        api_query.primary_classification = _get_classification(
            primary_param, 'primary_classification', parsed_terms
        )

    secondary_params = params.getlist('secondary_classification')
    if secondary_params:
        secondary_classifications = [
            _get_classification(value, 'secondary_classification',
                                parsed_terms)
            for value in secondary_params
        ]
        api_query.secondary_classification = secondary_classifications

    extra_fields = _get_include_fields(params, parsed_terms)
    if extra_fields:
        api_query.include_fields += extra_fields

    api_query = paginate(api_query, params)  # type: ignore
    document_set = index.search(api_query, highlight=False)
    document_set.metadata['query'] = parsed_terms
    logger.debug('Got document set with %i results', len(document_set.results))

    response_data = {'results': document_set, 'query': api_query}
    return response_data, status.HTTP_200_OK, {}
Exemple #3
0
def health_check() -> Tuple[str, int, Dict[str, Any]]:
    """
    Exercise the connection with the search index with a real query.

    Runs a fixed all-fields search for ``theory`` and reports whether it
    produced any results.

    Returns
    -------
    str
        ``'OK'`` if the query returned at least one result, otherwise
        ``'DOWN'`` (no results, or the query raised).
    int
        HTTP status code: 200 for ``'OK'``, 500 for ``'DOWN'``.
    dict
        Headers to add to the response (always empty here).

    """
    # Function-scope import so this block does not rely on a module-level
    # logger being defined elsewhere in the file.
    import logging

    try:
        documentset = index.search(
            SimpleQuery(  # type: ignore
                search_field='all', value='theory'))
    except Exception:
        # Deliberately broad: any failure means the index is not usable, and
        # a health check must answer rather than raise. Record the cause so
        # that a DOWN status can be diagnosed.
        logging.getLogger(__name__).exception('Health check query failed')
        return 'DOWN', status.HTTP_500_INTERNAL_SERVER_ERROR, {}
    if documentset.results:
        return 'OK', status.HTTP_200_OK, {}
    return 'DOWN', status.HTTP_500_INTERNAL_SERVER_ERROR, {}
Exemple #4
0
    def test_advanced_query(self, mock_Elasticsearch, mock_Search):
        """Check that :func:`.index.search` handles an :class:`AdvancedQuery`."""
        # One scored hit, inside a response whose item lookup always yields
        # ``{'total': 53}``.
        hit = mock.MagicMock()
        hit.meta.score = 1
        response = mock.MagicMock()
        response.__getitem__.return_value = {'total': 53}
        response.__iter__.return_value = [hit]

        # py-ES uses a chaining API: instantiating the search, each chained
        # call, and slicing all hand back the very same mock.
        mock_Search.return_value = mock_Search
        for chained in ('filter', 'highlight', 'highlight_options', 'query',
                        'sort'):
            getattr(mock_Search, chained).return_value = mock_Search
        mock_Search.__getitem__.return_value = mock_Search
        mock_Search.execute.return_value = response

        # (operator, field, term) for every fielded term in the query.
        term_specs = [
            ('AND', 'title', 'foo'),
            ('AND', 'author', 'joe'),
            ('OR', 'abstract', 'hmm'),
            ('NOT', 'comments', 'eh'),
            ('AND', 'journal_ref', 'jref (1999) 1:2-3'),
            ('AND', 'acm_class', 'abc123'),
            ('AND', 'msc_class', 'abc123'),
            ('OR', 'report_num', 'abc123'),
            ('OR', 'doi', '10.01234/56789'),
            ('OR', 'orcid', '0000-0000-0000-0000'),
            ('OR', 'author_id', 'Bloggs_J'),
        ]
        fielded_terms = FieldedSearchList([
            FieldedSearchTerm(operator=operator, field=field, term=term)
            for operator, field, term in term_specs
        ])
        hep_th = Classification(group='physics', archive='physics',
                                category='hep-th')
        last_five_days = DateRange(
            start_date=datetime.now() - timedelta(days=5),
            end_date=datetime.now()
        )

        advanced_query = AdvancedQuery(
            order='relevance',
            page_size=10,
            date_range=last_five_days,
            primary_classification=ClassificationList([hep_th]),
            terms=fielded_terms
        )
        document_set = index.search(advanced_query)

        self.assertIsInstance(document_set, DocumentSet)
        expected_metadata = (
            ('start', 0),
            ('total', 53),
            ('current_page', 1),
            ('total_pages', 6),
            ('page_size', 10),
        )
        for key, expected in expected_metadata:
            self.assertEqual(document_set.metadata[key], expected)
        self.assertEqual(len(document_set.results), 1)
Exemple #5
0
def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    At a minimum, expects the parameter ``query`` in the GET request. This may
    be a match value for a search query, or an arXiv ID. If it parses as an
    arXiv ID, the response is a redirect to that paper. Queries with the
    ``help`` or ``full_text`` search types are redirected to other search
    endpoints instead of being executed here.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
        GET parameters from the request. An immutable instance is copied to a
        mutable one, because the ``query`` value may be rewritten.
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.NotFound`
        Raised when ``archives`` is given but empty.
    :class:`.BadRequest`
        Raised when the ``order`` or ``size`` parameters fail validation, or
        when the index reports that the requested range is not allowed.
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    # Function-scope import: needed to URL-encode user input placed in
    # redirect targets below.
    from urllib.parse import quote_plus

    if archives is not None and not archives:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]

    logger.debug('simple search request')
    arxiv_id: Optional[str] = None
    if 'query' in request_params:
        try:
            # First check whether the query is an arXiv ID; if so we
            # redirect below.
            arxiv_id = identifier.parse_arxiv_id(request_params['query'])
            logger.debug('got arXiv ID: %s', arxiv_id)
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')

    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # The query is user input: encode it before embedding it in a
        # redirect URL, so that characters such as ``&``, ``#`` or whitespace
        # cannot truncate or alter the target's query string.
        encoded_query = quote_plus(form.query.data)

        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': f'/help/search?q={encoded_query}'}

        # Support classic "experimental" search
        elif form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={encoded_query}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        # NOTE(review): the contact address in the two error messages below
        # reads "[email protected]", which looks like an obfuscated e-mail
        # address -- confirm the intended address. Left unchanged here.
        try:
            # Execute the search. We'll use the results directly in
            #  template rendering, so they get added directly to the
            #  response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            #  this will clear up relatively quickly (next request), or
            #  there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again.  If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e

        except Exception as e:
            # Log anything unexpected, then let it propagate unchanged.
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                "It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                " over</a>.")
        # Other validation errors are rendered with the form; no query ran.
        q = None
    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
Exemple #6
0
def search(request_params: MultiDict) -> Response:
    """
    Perform a search from the advanced search interface.

    This is intended to support ONLY form-based search, to replace the classic
    advanced search view.

    The search is only validated and executed when the ``advanced`` parameter
    is present in the request; otherwise only the (unvalidated) form is
    returned for display.

    Parameters
    ----------
    request_params : dict
        GET parameters from the request. An immutable instance is copied to a
        mutable one, because ``terms-N-term`` values may be rewritten.

    Returns
    -------
    dict
        Response content.
    int
        HTTP status code.
    dict
        Extra headers to add to the response.

    Raises
    ------
    BadRequest
        Raised when the ``order`` or ``size`` parameters fail validation, or
        when the index reports that the requested range is not allowed.
    InternalServerError
        Raised when there is an unrecoverable error while interacting with the
        search index.

    """
    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('search request from advanced form')
    response_data: Dict[str, Any] = {}
    response_data['show_form'] = ('advanced' not in request_params)
    logger.debug('show_form: %s', str(response_data['show_form']))

    # Here we intervene on the user's query to look for holdouts from
    # the classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    has_classic = False
    # Iterate over a snapshot of the keys: values are replaced inside the
    # loop, and we should not depend on a live view while doing so.
    for key in list(request_params.keys()):
        if not (key.startswith('terms-') and key.endswith('-term')):
            continue

        # Capture the WHOLE index. With the quantifier outside the group
        # (``([0-9])+``) only the last digit is captured, so ``terms-12-term``
        # would wrongly be paired with ``terms-2-field``.
        match = re.search(r'terms-([0-9]+)-term', key)
        if match is None:
            # No numeric index (e.g. ``terms-x-term``); there is no field to
            # pair this key with, so leave it untouched.
            continue

        field = request_params.get(f'terms-{match.group(1)}-field')
        # We are only looking for this syntax in the author search, or
        # in an all-fields search.
        if field not in ['all', 'author']:
            continue

        value, _has_classic = catch_underscore_syntax(request_params.get(key))
        # Once any term used the classic syntax, keep the flag set.
        has_classic = has_classic or _has_classic
        request_params.setlist(key, [value])

    response_data['has_classic_format'] = has_classic
    form = forms.AdvancedSearchForm(request_params)
    q: Optional[Query]
    # We want to avoid attempting to validate if no query has been entered.
    #  If a query was actually submitted via the form, 'advanced' will be
    #  present in the request parameters.
    if 'advanced' in request_params:

        if form.validate():
            logger.debug('form is valid')
            q = _query_from_form(form)

            # Pagination is handled outside of the form.
            q = paginate(q, request_params)

            # NOTE(review): the contact address in the two error messages
            # below reads "[email protected]", which looks like an obfuscated
            # e-mail address -- confirm the intended address. Left unchanged.
            try:
                # Execute the search. We'll use the results directly in
                #  template rendering, so they get added directly to the
                #  response content.
                response_data.update(asdict(index.search(q)))
            except index.IndexConnectionError as e:
                # There was a (hopefully transient) connection problem. Either
                #  this will clear up relatively quickly (next request), or
                #  there is a more serious outage.
                logger.error('IndexConnectionError: %s', e)
                raise InternalServerError(
                    "There was a problem connecting to the search index. This "
                    "is quite likely a transient issue, so please try your "
                    "search again. If this problem persists, please report it "
                    "to [email protected].") from e
            except index.QueryError as e:
                # Base exception routers should pick this up and show bug page.
                logger.error('QueryError: %s', e)
                raise InternalServerError(
                    "There was a problem executing your query. Please try "
                    "your search again.  If this problem persists, please "
                    "report it to [email protected].") from e
            except index.OutsideAllowedRange as e:
                raise BadRequest(
                    "Hello clever friend. You can't get results in that range"
                    " right now.") from e
            response_data['query'] = q
        else:
            logger.debug('form is invalid: %s', str(form.errors))
            if 'order' in form.errors or 'size' in form.errors:
                # It's likely that the user tried to set these parameters
                # manually, or that the search originated from somewhere else
                # (and was configured incorrectly).
                advanced_url = url_for('ui.advanced_search')
                raise BadRequest(
                    "It looks like there's something odd about your search"
                    f" request. Please try <a href='{advanced_url}'>starting"
                    " over</a>.")

            # Force the form to be displayed, so that we can render errors.
            #  This has most likely occurred due to someone manually crafting
            #  a GET request, but it could be something else.
            response_data['show_form'] = True

    # We want the form handy even when it is not shown to the user. For
    #  example, we can generate new form-friendly requests to update sort
    #  order and page size by embedding the form (hidden).
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}