def test_simple_query(self, mock_Elasticsearch, mock_Search):
    """:class:`.index.search` supports :class:`SimpleQuery`."""
    # One fake hit, served by a fake response object. Any item lookup on
    # the response yields ``{'total': 53}``.
    hit = mock.MagicMock()
    hit.meta.score = 1
    response = mock.MagicMock()
    response.__getitem__.return_value = {'total': 53}
    response.__iter__.return_value = [hit]
    mock_Search.execute.return_value = response

    # Support the chaining API for py-ES: instantiating the mock, calling
    # any of these builder methods, or slicing it all hand back the same
    # mock.
    mock_Search.return_value = mock_Search
    for chained in ('filter', 'highlight', 'highlight_options', 'query',
                    'sort'):
        getattr(mock_Search, chained).return_value = mock_Search
    mock_Search.__getitem__.return_value = mock_Search

    query = SimpleQuery(
        order='relevance',
        page_size=10,
        search_field='title',
        value='foo title'
    )
    document_set = index.search(query)

    self.assertIsInstance(document_set, DocumentSet)
    expected_metadata = (
        ('start', 0),
        ('total', 53),
        ('current_page', 1),
        ('total_pages', 6),
        ('page_size', 10),
    )
    for key, expected in expected_metadata:
        self.assertEqual(document_set.metadata[key], expected)
    self.assertEqual(len(document_set.results), 1)
def search(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any]]:
    """
    Handle a search request from the API.

    Parameters
    ----------
    params : :class:`MultiDict`
        GET query parameters from the request.

    Returns
    -------
    dict
        Response data (to serialize); holds the document set under
        ``results`` and the executed query under ``query``.
    int
        HTTP status code.
    dict
        Extra headers for the response.

    """
    q = APIQuery()

    # Every ``_get_*`` helper below is handed this same list, which ends up
    # in the result metadata. Presumably each helper records what it parsed
    # in it -- verify against the helpers. The call order is kept fixed for
    # that reason.
    query_terms: List[Dict[str, Any]] = []

    fielded = _get_fielded_terms(params, query_terms)
    if fielded is not None:
        q.terms = fielded

    dates = _get_date_params(params, query_terms)
    if dates is not None:
        q.date_range = dates

    primary_value = params.get('primary_classification')
    if primary_value:
        q.primary_classification = _get_classification(
            primary_value, 'primary_classification', query_terms
        )

    secondary_values = params.getlist('secondary_classification')
    if secondary_values:
        parsed_secondaries = []
        for raw in secondary_values:
            parsed_secondaries.append(
                _get_classification(raw, 'secondary_classification',
                                    query_terms)
            )
        q.secondary_classification = parsed_secondaries

    extra_fields = _get_include_fields(params, query_terms)
    if extra_fields:
        q.include_fields += extra_fields

    q = paginate(q, params)     # type: ignore
    found = index.search(q, highlight=False)
    found.metadata['query'] = query_terms
    logger.debug('Got document set with %i results', len(found.results))

    payload = {'results': found, 'query': q}
    return payload, status.HTTP_200_OK, {}
def health_check() -> Tuple[str, int, Dict[str, Any]]:
    """
    Exercise the connection with the search index with a real query.

    Runs a fixed all-fields search for ``theory`` and reports the index as
    healthy only if that search returns at least one result.

    Returns
    -------
    str
        ``'OK'`` if the probe query returned results, otherwise ``'DOWN'``.
    int
        HTTP status code (200 when healthy, 500 otherwise).
    dict
        Headers to add to the response.

    """
    try:
        documentset = index.search(
            SimpleQuery(   # type: ignore
                search_field='all',
                value='theory'
            )
        )
    except Exception:
        # Deliberately broad: whatever goes wrong while querying the index,
        # the health check must answer 'DOWN' rather than raise.
        return 'DOWN', status.HTTP_500_INTERNAL_SERVER_ERROR, {}
    if documentset.results:
        return 'OK', status.HTTP_200_OK, {}
    # The query ran but found nothing; treated as unhealthy as well.
    return 'DOWN', status.HTTP_500_INTERNAL_SERVER_ERROR, {}
def test_advanced_query(self, mock_Elasticsearch, mock_Search):
    """:class:`.index.search` supports :class:`AdvancedQuery`."""
    # One fake hit, served by a fake response object. Any item lookup on
    # the response yields ``{'total': 53}``.
    hit = mock.MagicMock()
    hit.meta.score = 1
    response = mock.MagicMock()
    response.__getitem__.return_value = {'total': 53}
    response.__iter__.return_value = [hit]
    mock_Search.execute.return_value = response

    # Support the chaining API for py-ES: instantiating the mock, calling
    # any of these builder methods, or slicing it all hand back the same
    # mock.
    mock_Search.return_value = mock_Search
    for chained in ('filter', 'highlight', 'highlight_options', 'query',
                    'sort'):
        getattr(mock_Search, chained).return_value = mock_Search
    mock_Search.__getitem__.return_value = mock_Search

    date_range = DateRange(
        start_date=datetime.now() - timedelta(days=5),
        end_date=datetime.now()
    )
    primary = ClassificationList([
        Classification(group='physics', archive='physics',
                       category='hep-th')
    ])
    # (operator, field, term) for each fielded term in the query.
    term_specs = [
        ('AND', 'title', 'foo'),
        ('AND', 'author', 'joe'),
        ('OR', 'abstract', 'hmm'),
        ('NOT', 'comments', 'eh'),
        ('AND', 'journal_ref', 'jref (1999) 1:2-3'),
        ('AND', 'acm_class', 'abc123'),
        ('AND', 'msc_class', 'abc123'),
        ('OR', 'report_num', 'abc123'),
        ('OR', 'doi', '10.01234/56789'),
        ('OR', 'orcid', '0000-0000-0000-0000'),
        ('OR', 'author_id', 'Bloggs_J'),
    ]
    terms = FieldedSearchList([
        FieldedSearchTerm(operator=operator, field=field, term=term)
        for operator, field, term in term_specs
    ])
    query = AdvancedQuery(
        order='relevance',
        page_size=10,
        date_range=date_range,
        primary_classification=primary,
        terms=terms
    )
    document_set = index.search(query)

    self.assertIsInstance(document_set, DocumentSet)
    expected_metadata = (
        ('start', 0),
        ('total', 53),
        ('current_page', 1),
        ('total_pages', 6),
        ('page_size', 10),
    )
    for key, expected in expected_metadata:
        self.assertEqual(document_set.metadata[key], expected)
    self.assertEqual(len(document_set.results), 1)
def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    At a minimum, expects the parameter ``query`` in the GET request. This
    may be a match value for a search query, or an arXiv ID.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
        GET parameters from the request. An immutable instance is copied
        into a mutable one, because the query may be rewritten here.
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.NotFound`
        Raised when ``archives`` is an empty list.
    :class:`.BadRequest`
        Raised when the ``order`` or ``size`` parameter is invalid, or when
        the requested result range is not allowed.
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was
        an unexpected problem executing the query.

    """
    # Imported here so that this function does not depend on the module's
    # import block.
    from urllib.parse import quote_plus

    if archives is not None and len(archives) == 0:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]

    logger.debug('simple search request')
    if 'query' in request_params:
        try:
            # first check if the URL includes an arXiv ID
            arxiv_id: Optional[str] = identifier.parse_arxiv_id(
                request_params['query']
            )
            # If so, redirect.
            logger.debug(f"got arXiv ID: {arxiv_id}")
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')
            arxiv_id = None
    else:
        arxiv_id = None

    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # The query is user input that ends up inside the query string of a
        # redirect URL. It must be percent-encoded, otherwise characters
        # such as '&', '#', '+' or spaces corrupt the redirect target.
        encoded_query = quote_plus(str(form.query.data))

        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': f'/help/search?q={encoded_query}'}

        # Support classic "experimental" search
        elif form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={encoded_query}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        # NOTE(review): the contact address in the messages below looks like
        # an e-mail obfuscation artifact -- confirm the intended address.
        try:
            # Execute the search. We'll use the results directly in
            # template rendering, so they get added directly to the
            # response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            # this will clear up relatively quickly (next request), or
            # there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e
        except Exception as e:
            # Log anything unexpected, then let it propagate unchanged.
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                "It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                " over</a>.")
        q = None

    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
def search(request_params: MultiDict) -> Response:
    """
    Perform a search from the advanced search interface.

    This is intended to support ONLY form-based search, to replace the classic
    advanced search view.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
        GET parameters from the request. An immutable instance is copied
        into a mutable one, because term values may be rewritten here.

    Returns
    -------
    dict
        Response content.
    int
        HTTP status code.
    dict
        Extra headers to add to the response.

    Raises
    ------
    :class:`.BadRequest`
        Raised when the ``order`` or ``size`` parameter is invalid, or when
        the requested result range is not allowed.
    :class:`.InternalServerError`
        Raised when there is an unrecoverable error while interacting with the
        search index.

    """
    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('search request from advanced form')
    response_data: Dict[str, Any] = {}
    response_data['show_form'] = ('advanced' not in request_params)
    logger.debug('show_form: %s', str(response_data['show_form']))

    # Here we intervene on the user's query to look for holdouts from
    # the classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    has_classic = False
    # Iterate over a snapshot of the keys, since values are replaced inside
    # the loop.
    for key in list(request_params.keys()):
        # The whole index must be captured: a repeated one-digit group
        # (``([0-9])+``) keeps only the last digit, which would pair
        # ``terms-12-term`` with ``terms-2-field``.
        matched = re.fullmatch(r'terms-([0-9]+)-term', key)
        if matched is None:
            # Not a term field of the form; nothing to rewrite.
            continue
        i = matched.group(1)
        field = request_params.get(f'terms-{i}-field')
        # We are only looking for this syntax in the author search, or
        # in an all-fields search.
        if field not in ['all', 'author']:
            continue
        value, _has_classic = catch_underscore_syntax(request_params.get(key))
        has_classic = has_classic or _has_classic
        request_params.setlist(key, [value])

    response_data['has_classic_format'] = has_classic

    form = forms.AdvancedSearchForm(request_params)
    q: Optional[Query]
    # We want to avoid attempting to validate if no query has been entered.
    # If a query was actually submitted via the form, 'advanced' will be
    # present in the request parameters.
    if 'advanced' in request_params:
        if form.validate():
            logger.debug('form is valid')
            q = _query_from_form(form)

            # Pagination is handled outside of the form.
            q = paginate(q, request_params)

            # NOTE(review): the contact address in the messages below looks
            # like an e-mail obfuscation artifact -- confirm the intended
            # address.
            try:
                # Execute the search. We'll use the results directly in
                # template rendering, so they get added directly to the
                # response content.
                response_data.update(asdict(index.search(q)))
            except index.IndexConnectionError as e:
                # There was a (hopefully transient) connection problem. Either
                # this will clear up relatively quickly (next request), or
                # there is a more serious outage.
                logger.error('IndexConnectionError: %s', e)
                raise InternalServerError(
                    "There was a problem connecting to the search index. This "
                    "is quite likely a transient issue, so please try your "
                    "search again. If this problem persists, please report it "
                    "to [email protected].") from e
            except index.QueryError as e:
                # Base exception routers should pick this up and show bug page.
                logger.error('QueryError: %s', e)
                raise InternalServerError(
                    "There was a problem executing your query. Please try "
                    "your search again. If this problem persists, please "
                    "report it to [email protected].") from e
            except index.OutsideAllowedRange as e:
                raise BadRequest(
                    "Hello clever friend. You can't get results in that range"
                    " right now.") from e

            response_data['query'] = q
        else:
            logger.debug('form is invalid: %s', str(form.errors))
            if 'order' in form.errors or 'size' in form.errors:
                # It's likely that the user tried to set these parameters
                # manually, or that the search originated from somewhere else
                # (and was configured incorrectly).
                advanced_url = url_for('ui.advanced_search')
                raise BadRequest(
                    "It looks like there's something odd about your search"
                    f" request. Please try <a href='{advanced_url}'>starting"
                    " over</a>.")

            # Force the form to be displayed, so that we can render errors.
            # This has most likely occurred due to someone manually crafting
            # a GET response, but it could be something else.
            response_data['show_form'] = True

    # We want the form handy even when it is not shown to the user. For
    # example, we can generate new form-friendly requests to update sort
    # order and page size by embedding the form (hidden).
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}