def to_cache(cache_dir: str, arxiv_id: str, docmeta: List[DocMeta]) -> None:
    """
    Add a paper's metadata documents to the local cache, if available.

    Parameters
    ----------
    cache_dir : str
        Path to the local metadata cache; if empty, caching is skipped.
    arxiv_id : str
    docmeta : list
        Items must be :class:`.DocMeta` instances.

    Raises
    ------
    RuntimeError
        Raised when the cache is not available, or the document could not be
        added to the cache.

    """
    if not cache_dir:
        return
    fname = "%s.json" % arxiv_id.replace("/", "_")
    cache_path = os.path.join(cache_dir, fname)
    try:
        with open(cache_path, "w") as f:
            json.dump([asdict(dm) for dm in docmeta], f)
    except Exception as ex:
        raise RuntimeError(str(ex)) from ex
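
# Illustration of the cache layout produced by to_cache above (the directory
# is hypothetical; only the filename rule comes from the code): for
# arxiv_id "hep-th/9901001" the slash is replaced with an underscore, so the
# DocMeta list is serialized to <cache_dir>/hep-th_9901001.json.
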
def bulk_add_documents(self, documents: List[Document],
                       docs_per_chunk: int = 500) -> None:
    """
    Add documents to the search index using the bulk API.

    Parameters
    ----------
    documents : list
        Each item must be a valid :class:`.Document`, per
        ``schema/DocumentMetadata.json``.
    docs_per_chunk : int
        Number of documents to send to ES in a single chunk.

    Raises
    ------
    IndexConnectionError
        Problem communicating with the Elasticsearch host.
    BulkIndexingError
        Problem serializing ``documents`` for indexing.

    """
    if not self.es.indices.exists(index=self.index):
        logger.debug('index does not exist')
        self.create_index()
        logger.debug('created index')
    with handle_es_exceptions():
        actions = ({
            '_index': self.index,
            '_type': self.doc_type,
            '_id': document.id,
            '_source': asdict(document)
        } for document in documents)
        helpers.bulk(client=self.es, actions=actions,
                     chunk_size=docs_per_chunk)
    logger.debug('added %i documents to index', len(documents))
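
# Hedged usage sketch: this mirrors the call pattern used by the populate
# command further below; the chunk size shown here is illustrative only.
#
#     docs = [transform.to_search_document(dm) for dm in meta]
#     index.SearchSession.bulk_add_documents(docs, docs_per_chunk=250)
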
def populate(print_indexable: bool, paper_id: str, id_list: str,
             load_cache: bool, cache_dir: str) -> None:
    """Populate the search index with some test data."""
    cache_dir = init_cache(cache_dir)
    index_count = 0
    if paper_id:    # Index a single paper.
        TO_INDEX = [paper_id]
    elif id_list:   # Index a list of papers.
        TO_INDEX = load_id_list(id_list)
    else:
        TO_INDEX = load_id_sample()

    approx_size = len(TO_INDEX)
    retrieve_chunk_size = 50
    index_chunk_size = 250
    chunk: List[str] = []
    meta: List[DocMeta] = []
    try:
        with click.progressbar(length=approx_size,
                               label='Papers indexed') as index_bar:
            last = len(TO_INDEX) - 1
            for i, paper_id in enumerate(TO_INDEX):
                this_meta = []
                if load_cache:
                    try:
                        this_meta = from_cache(cache_dir, paper_id)
                    except RuntimeError:    # No cached document.
                        pass

                if this_meta:
                    meta += this_meta
                else:
                    chunk.append(paper_id)

                if len(chunk) == retrieve_chunk_size or i == last:
                    try:
                        new_meta = metadata.bulk_retrieve(chunk)
                    except metadata.ConnectionFailed:   # Try again.
                        new_meta = metadata.bulk_retrieve(chunk)

                    # Add metadata to the cache.
                    key = lambda dm: dm.paper_id
                    new_meta_srt = sorted(new_meta, key=key)
                    for paper_id, grp in groupby(new_meta_srt, key):
                        to_cache(cache_dir, paper_id, [dm for dm in grp])
                    meta += new_meta
                    chunk = []

                # Index papers on a different chunk cycle, and at the very
                # end.
                if len(meta) >= index_chunk_size or i == last:
                    # Transform to Document.
                    documents = [
                        transform.to_search_document(dm) for dm in meta
                    ]
                    # Add to index.
                    index.bulk_add_documents(documents)

                    if print_indexable:
                        for document in documents:
                            click.echo(json.dumps(asdict(document)))
                    index_count += len(documents)
                    meta = []

                index_bar.update(i)
    except Exception as e:
        raise RuntimeError('Populate failed: %s' % str(e)) from e
    finally:
        click.echo(f"Indexed {index_count} documents in total")
        click.echo(f"Cache path: {cache_dir}; use `-c {cache_dir}` to reuse in"
                   f" subsequent calls")
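
# Note on the chunk sizes above: metadata is fetched 50 paper IDs at a time
# and flushed to the index once at least 250 DocMeta records have accumulated
# (or the final ID is reached), so several retrieval chunks normally feed each
# bulk-indexing call.
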
def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    At a minimum, expects the parameter ``value`` in the GET request. This may
    be a match value for a search query, or an arXiv ID.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    if archives is not None and len(archives) == 0:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]
    logger.debug('simple search request')

    if 'query' in request_params:
        try:
            # First check whether the URL includes an arXiv ID.
            arxiv_id: Optional[str] = identifier.parse_arxiv_id(
                request_params['query'])
            # If so, redirect.
            logger.debug(f"got arXiv ID: {arxiv_id}")
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')
            arxiv_id = None
    else:
        arxiv_id = None

    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We rewrite
    # with a comma, and show a warning to the user about the change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(
                request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # Temporary workaround to support classic help search.
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': f'/help/search?q={form.query.data}'}

        # Support classic "experimental" full-text search.
        elif form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY, \
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={form.query.data}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)
        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        try:
            # Execute the search. We'll use the results directly in template
            # rendering, so they get added directly to the response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            # this will clear up relatively quickly (next request), or there
            # is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e
        except Exception as e:
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters
            # manually, or that the search originated from somewhere else
            # (and was configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                f"It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                f" over</a>.")
        q = None

    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
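
# Example of the classic-format intervention above (values are illustrative;
# the exact rewrite is up to catch_underscore_syntax): an author or all-fields
# query such as "smith_j" is rewritten to the comma form "smith, j", and
# has_classic_format is set so the template can warn the user about the
# change.
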
def search(request_params: MultiDict) -> Response:
    """
    Perform a search from the advanced search interface.

    This is intended to support ONLY form-based search, to replace the classic
    advanced search view.

    Parameters
    ----------
    request_params : :class:`.MultiDict`

    Returns
    -------
    dict
        Response content.
    int
        HTTP status code.
    dict
        Extra headers to add to the response.

    Raises
    ------
    InternalServerError
        Raised when there is an unrecoverable error while interacting with
        the search index.

    """
    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('search request from advanced form')
    response_data: Dict[str, Any] = {}
    response_data['show_form'] = ('advanced' not in request_params)
    logger.debug('show_form: %s', str(response_data['show_form']))

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We rewrite
    # with a comma, and show a warning to the user about the change.
    has_classic = False
    for key in request_params.keys():
        if key.startswith('terms-') and key.endswith('-term'):
            value = request_params.get(key)
            i = re.search('terms-([0-9]+)-term', key).group(1)
            field = request_params.get(f'terms-{i}-field')
            # We are only looking for this syntax in the author search, or
            # in an all-fields search.
            if field not in ['all', 'author']:
                continue
            value, _has_classic = catch_underscore_syntax(value)
            has_classic = has_classic or _has_classic
            request_params.setlist(key, [value])
    response_data['has_classic_format'] = has_classic

    form = forms.AdvancedSearchForm(request_params)
    q: Optional[Query]

    # We want to avoid attempting to validate if no query has been entered.
    # If a query was actually submitted via the form, 'advanced' will be
    # present in the request parameters.
    if 'advanced' in request_params:
        if form.validate():
            logger.debug('form is valid')
            q = _query_from_form(form)

            # Pagination is handled outside of the form.
            q = paginate(q, request_params)

            try:
                # Execute the search. We'll use the results directly in
                # template rendering, so they get added directly to the
                # response content.
                response_data.update(asdict(index.search(q)))
            except index.IndexConnectionError as e:
                # There was a (hopefully transient) connection problem. Either
                # this will clear up relatively quickly (next request), or
                # there is a more serious outage.
                logger.error('IndexConnectionError: %s', e)
                raise InternalServerError(
                    "There was a problem connecting to the search index. This "
                    "is quite likely a transient issue, so please try your "
                    "search again. If this problem persists, please report it "
                    "to [email protected].") from e
            except index.QueryError as e:
                # Base exception routers should pick this up and show bug
                # page.
                logger.error('QueryError: %s', e)
                raise InternalServerError(
                    "There was a problem executing your query. Please try "
                    "your search again. If this problem persists, please "
                    "report it to [email protected].") from e
            except index.OutsideAllowedRange as e:
                raise BadRequest(
                    "Hello clever friend. You can't get results in that range"
                    " right now.") from e

            response_data['query'] = q
        else:
            logger.debug('form is invalid: %s', str(form.errors))
            if 'order' in form.errors or 'size' in form.errors:
                # It's likely that the user tried to set these parameters
                # manually, or that the search originated from somewhere else
                # (and was configured incorrectly).
                advanced_url = url_for('ui.advanced_search')
                raise BadRequest(
                    f"It looks like there's something odd about your search"
                    f" request. Please try <a href='{advanced_url}'>starting"
                    f" over</a>.")

            # Force the form to be displayed, so that we can render errors.
            # This has most likely occurred due to someone manually crafting
            # a GET response, but it could be something else.
            response_data['show_form'] = True

    # We want the form handy even when it is not shown to the user. For
    # example, we can generate new form-friendly requests to update sort
    # order and page size by embedding the form (hidden).
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
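
# Example of the advanced-form parameters inspected above (illustrative
# values): each fielded term arrives as an indexed pair such as
# terms-0-term=smith_j and terms-0-field=author; only author and all-fields
# terms are passed through catch_underscore_syntax.
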
def populate(
    print_indexable: bool,
    paper_id: str,
    id_list: str,
    cache_dir: str,
    no_cache: bool,
    quiet: bool,
) -> None:
    """Populate the search index with some test data."""
    if cache_dir and no_cache:
        raise RuntimeError("Cannot set both no cache and cache dir")
    if not no_cache:
        cache_dir = init_cache(cache_dir)
    else:
        cache_dir = None

    index_count = 0
    if paper_id:    # Index a single paper.
        TO_INDEX = [paper_id]
    elif id_list:   # Index a list of papers.
        TO_INDEX = load_id_list(id_list)
    else:
        TO_INDEX = load_id_sample()

    approx_size = len(TO_INDEX)
    retrieve_chunk_size = 50
    index_chunk_size = 250
    chunk: List[str] = []
    meta: List[DocMeta] = []
    index.SearchSession.create_index()

    progress = NoopContextManager() if quiet \
        else click.progressbar(length=approx_size)
    try:
        with progress as index_bar:
            last = len(TO_INDEX) - 1
            for i, paper_id in enumerate(TO_INDEX):
                this_meta = []
                if cache_dir:
                    this_meta = from_cache(cache_dir, paper_id)

                if this_meta:
                    meta += this_meta
                else:
                    chunk.append(paper_id)

                if len(chunk) == retrieve_chunk_size or (chunk and i == last):
                    try:
                        new_meta = metadata.bulk_retrieve(chunk)
                    except metadata.ConnectionFailed:   # Try again.
                        new_meta = metadata.bulk_retrieve(chunk)
                    meta += new_meta
                    chunk = []

                    if not no_cache:
                        # Add metadata to the cache.
                        new_meta_srt = sorted(new_meta, key=get_paper_id)
                        for paper_id, grp in groupby(new_meta_srt,
                                                     get_paper_id):
                            to_cache(cache_dir, paper_id, [dm for dm in grp])

                # Index papers on a different chunk cycle, and at the very
                # end.
                if len(meta) >= index_chunk_size or i == last:
                    # Transform to Document.
                    docs = [transform.to_search_document(dm) for dm in meta]
                    # Add to index.
                    index.SearchSession.bulk_add_documents(docs)

                    if print_indexable:
                        for document in docs:
                            click.echo(json.dumps(asdict(document)))
                    index_count += len(docs)
                    meta = []

                index_bar.update(i)
    finally:
        if not quiet:
            click.echo(f"Indexed {index_count} documents in total")
            if cache_dir:
                click.echo(
                    f"Cache path: {cache_dir}; use `-c {cache_dir}` to reuse"
                    f" in subsequent calls")
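
# Hedged usage note (only the -c flag is confirmed by the output above; other
# option names depend on the CLI wiring): a first run populates the index and
# warms the metadata cache, printing its location; re-running with
# `-c <cache_dir>` then reads DocMeta from the cache instead of calling the
# metadata service again, which is what the cache_dir branch at the top of the
# loop implements.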