Example 1
def to_cache(cache_dir: str, arxiv_id: str, docmeta: List[DocMeta]) -> None:
    """
    Add documents to the local cache, if a cache directory is set.

    Parameters
    ----------
    cache_dir : str
        Path to the local cache directory; if empty, nothing is cached.
    arxiv_id : str
    docmeta : list of :class:`.DocMeta`

    Raises
    ------
    RuntimeError
        Raised when the cache is not available, or the document could not
        be added to the cache.

    """
    if not cache_dir:
        return
    fname = "%s.json" % arxiv_id.replace("/", "_")
    cache_path = os.path.join(cache_dir, fname)
    try:
        with open(cache_path, "w") as f:
            json.dump([asdict(dm) for dm in docmeta], f)
    except Exception as ex:
        raise RuntimeError(str(ex)) from ex
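
A minimal usage sketch for to_cache; the DocMeta dataclass and cache path below are stand-ins for illustration, not the real arxiv-search definitions.

import json
import os
import tempfile
from dataclasses import dataclass

@dataclass
class DocMeta:  # stand-in for the real DocMeta dataclass
    paper_id: str
    title: str

cache_dir = tempfile.mkdtemp()
to_cache(cache_dir, "1705.10311", [DocMeta("1705.10311", "An example title")])
# "/" in old-style IDs becomes "_" in the file name: cs/0401001 -> cs_0401001.json
with open(os.path.join(cache_dir, "1705.10311.json")) as f:
    print(json.load(f))  # [{'paper_id': '1705.10311', 'title': 'An example title'}]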
Example 2
    def bulk_add_documents(self,
                           documents: List[Document],
                           docs_per_chunk: int = 500) -> None:
        """
        Add documents to the search index using the bulk API.

        Parameters
        ----------
        documents : list of :class:`.Document`
            Each must be a valid search document, per
            ``schema/DocumentMetadata.json``.
        docs_per_chunk : int
            Number of documents to send to ES in a single chunk.

        Raises
        ------
        IndexConnectionError
            Problem communicating with Elasticsearch host.
        BulkIndexingError
            Problem serializing ``documents`` for indexing.

        """
        if not self.es.indices.exists(index=self.index):
            logger.debug('index does not exist')
            self.create_index()
            logger.debug('created index')

        with handle_es_exceptions():
            actions = ({
                '_index': self.index,
                '_type': self.doc_type,
                '_id': document.id,
                '_source': asdict(document)
            } for document in documents)

            helpers.bulk(client=self.es,
                         actions=actions,
                         chunk_size=docs_per_chunk)
            logger.debug('added %i documents to index', len(documents))
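
A sketch of the bulk-actions pattern used above: each Document becomes one Elasticsearch bulk action dict. The Document dataclass and the index/doc-type names here are illustrative assumptions.

from dataclasses import dataclass, asdict
from typing import Iterator, List

@dataclass
class Document:  # stand-in for the real search Document dataclass
    id: str
    title: str

def make_actions(documents: List[Document],
                 index_name: str = "arxiv",      # placeholder index name
                 doc_type: str = "document") -> Iterator[dict]:
    """Yield one bulk-API action per document, mirroring the generator above."""
    for document in documents:
        yield {
            "_index": index_name,
            "_type": doc_type,
            "_id": document.id,
            "_source": asdict(document),
        }

# helpers.bulk(client=es, actions=make_actions(docs), chunk_size=500) then streams
# these actions to Elasticsearch in chunks of 500 documents.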
Example 3
def populate(print_indexable: bool, paper_id: str, id_list: str,
             load_cache: bool, cache_dir: str) -> None:
    """Populate the search index with some test data."""
    cache_dir = init_cache(cache_dir)
    index_count = 0
    if paper_id:    # Index a single paper.
        TO_INDEX = [paper_id]
    elif id_list:   # Index a list of papers.
        TO_INDEX = load_id_list(id_list)
    else:
        TO_INDEX = load_id_sample()
    approx_size = len(TO_INDEX)

    retrieve_chunk_size = 50
    index_chunk_size = 250
    chunk: List[str] = []
    meta: List[DocMeta] = []

    try:
        with click.progressbar(length=approx_size,
                               label='Papers indexed') as index_bar:
            last = len(TO_INDEX) - 1
            for i, paper_id in enumerate(TO_INDEX):
                this_meta = []
                if load_cache:
                    try:
                        this_meta = from_cache(cache_dir, paper_id)
                    except RuntimeError:    # No document in the cache.
                        pass

                if this_meta:
                    meta += this_meta
                else:
                    chunk.append(paper_id)

                if len(chunk) == retrieve_chunk_size or (chunk and i == last):
                    try:
                        new_meta = metadata.bulk_retrieve(chunk)
                    except metadata.ConnectionFailed:  # Try again once.
                        new_meta = metadata.bulk_retrieve(chunk)
                    # Add metadata to the cache.
                    key = lambda dm: dm.paper_id
                    new_meta_srt = sorted(new_meta, key=key)
                    for paper_id, grp in groupby(new_meta_srt, key):
                        to_cache(cache_dir, paper_id, [dm for dm in grp])
                    meta += new_meta
                    chunk = []

                # Index papers on a different chunk cycle, and at the very end.
                if len(meta) >= index_chunk_size or i == last:
                    # Transform to Document.
                    documents = [
                        transform.to_search_document(dm) for dm in meta
                    ]
                    # Add to index.
                    index.bulk_add_documents(documents)

                    if print_indexable:
                        for document in documents:
                            click.echo(json.dumps(asdict(document)))
                    index_count += len(documents)
                    meta = []
                    index_bar.update(i)

    except Exception as e:
        raise RuntimeError('Populate failed: %s' % str(e)) from e

    finally:
        click.echo(f"Indexed {index_count} documents in total")
        click.echo(f"Cache path: {cache_dir}; use `-c {cache_dir}` to reuse in"
                   f" subsequent calls")
Example 4
def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    At a minimum, expects the parameter ``value`` in the GET request. This may
    be a match value for a search query, or an arXiv ID.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    if archives is not None and len(archives) == 0:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]

    logger.debug('simple search request')
    if 'query' in request_params:
        try:
            # first check if the URL includes an arXiv ID
            arxiv_id: Optional[str] = identifier.parse_arxiv_id(
                request_params['query'])
            # If so, redirect.
            logger.debug(f"got arXiv ID: {arxiv_id}")
        except ValueError:
            logger.debug('No arXiv ID detected; fall back to form')
            arxiv_id = None
    else:
        arxiv_id = None

    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': f'/help/search?q={form.query.data}'}

        # Support classic "experimental" search
        elif form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={form.query.data}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        try:
            # Execute the search. We'll use the results directly in
            #  template rendering, so they get added directly to the
            #  response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            #  this will clear up relatively quickly (next request), or
            #  there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "[email protected].") from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again.  If this problem persists, please report it to "
                "[email protected].") from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now.") from e

        except Exception as e:
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                f"It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                f" over</a>.")
        q = None
    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
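
catch_underscore_syntax is defined elsewhere in the codebase; a hedged sketch of the behaviour described above (rewrite the classic surname_f author syntax as "surname, f" and report whether a rewrite happened) could look like this. The function name and regex are illustrative assumptions.

import re
from typing import Tuple

def catch_underscore_syntax_sketch(query: str) -> Tuple[str, bool]:
    """Rewrite classic 'surname_f' author terms as 'surname, f' (sketch only)."""
    rewritten = re.sub(r"\b([a-zA-Z]+)_([a-zA-Z])\b", r"\1, \2", query)
    return rewritten, rewritten != query

print(catch_underscore_syntax_sketch("ginsparg_p"))   # ('ginsparg, p', True)
print(catch_underscore_syntax_sketch("dark matter"))  # ('dark matter', False)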
Example 5
def search(request_params: MultiDict) -> Response:
    """
    Perform a search from the advanced search interface.

    This is intended to support ONLY form-based search, to replace the classic
    advanced search view.

    Parameters
    ----------
    request_params : dict

    Returns
    -------
    dict
        Response content.
    int
        HTTP status code.
    dict
        Extra headers to add to the response.

    Raises
    ------
    InternalServerError
        Raised when there is an unrecoverable error while interacting with the
        search index.

    """
    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('search request from advanced form')
    response_data: Dict[str, Any] = {}
    response_data['show_form'] = ('advanced' not in request_params)
    logger.debug('show_form: %s', str(response_data['show_form']))

    # Here we intervene on the user's query to look for holdouts from
    # the classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    has_classic = False
    for key in request_params.keys():
        if key.startswith('terms-') and key.endswith('-term'):
            value = request_params.get(key)
            i = re.search(r'terms-([0-9]+)-term', key).group(1)
            field = request_params.get(f'terms-{i}-field')
            # We are only looking for this syntax in the author search, or
            # in an all-fields search.
            if field not in ['all', 'author']:
                continue

            value, _has_classic = catch_underscore_syntax(value)
            has_classic = has_classic or _has_classic
            request_params.setlist(key, [value])

    response_data['has_classic_format'] = has_classic
    form = forms.AdvancedSearchForm(request_params)
    q: Optional[Query]
    # We want to avoid attempting to validate if no query has been entered.
    #  If a query was actually submitted via the form, 'advanced' will be
    #  present in the request parameters.
    if 'advanced' in request_params:

        if form.validate():
            logger.debug('form is valid')
            q = _query_from_form(form)

            # Pagination is handled outside of the form.
            q = paginate(q, request_params)

            try:
                # Execute the search. We'll use the results directly in
                #  template rendering, so they get added directly to the
                #  response content.
                response_data.update(asdict(index.search(q)))
            except index.IndexConnectionError as e:
                # There was a (hopefully transient) connection problem. Either
                #  this will clear up relatively quickly (next request), or
                #  there is a more serious outage.
                logger.error('IndexConnectionError: %s', e)
                raise InternalServerError(
                    "There was a problem connecting to the search index. This "
                    "is quite likely a transient issue, so please try your "
                    "search again. If this problem persists, please report it "
                    "to [email protected].") from e
            except index.QueryError as e:
                # Base exception routers should pick this up and show bug page.
                logger.error('QueryError: %s', e)
                raise InternalServerError(
                    "There was a problem executing your query. Please try "
                    "your search again.  If this problem persists, please "
                    "report it to [email protected].") from e
            except index.OutsideAllowedRange as e:
                raise BadRequest(
                    "Hello clever friend. You can't get results in that range"
                    " right now.") from e
            response_data['query'] = q
        else:
            logger.debug('form is invalid: %s', str(form.errors))
            if 'order' in form.errors or 'size' in form.errors:
                # It's likely that the user tried to set these parameters
                # manually, or that the search originated from somewhere else
                # (and was configured incorrectly).
                advanced_url = url_for('ui.advanced_search')
                raise BadRequest(
                    f"It looks like there's something odd about your search"
                    f" request. Please try <a href='{advanced_url}'>starting"
                    f" over</a>.")

            # Force the form to be displayed, so that we can render errors.
            #  This has most likely occurred due to someone manually crafting
            #  a GET response, but it could be something else.
            response_data['show_form'] = True

    # We want the form handy even when it is not shown to the user. For
    #  example, we can generate new form-friendly requests to update sort
    #  order and page size by embedding the form (hidden).
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}
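
A small sketch of how the terms-<i>-term / terms-<i>-field pairs above are matched up, using the corrected regex group ([0-9]+); a plain dict stands in for the request MultiDict.

import re

# Simulated advanced-search request parameters (plain dict instead of MultiDict).
params = {
    "terms-0-term": "ginsparg_p",
    "terms-0-field": "author",
    "terms-1-term": "quantum gravity",
    "terms-1-field": "title",
}

for key in params:
    if key.startswith("terms-") and key.endswith("-term"):
        # ([0-9]+) captures the whole index, so multi-digit indices also work.
        i = re.search(r"terms-([0-9]+)-term", key).group(1)
        field = params[f"terms-{i}-field"]
        print(i, field, params[key])
# 0 author ginsparg_p
# 1 title quantum gravity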
Example 6
def populate(
    print_indexable: bool,
    paper_id: str,
    id_list: str,
    cache_dir: str,
    no_cache: bool,
    quiet: bool,
) -> None:
    """Populate the search index with some test data."""
    if cache_dir and no_cache:
        raise RuntimeError("Cannot set both no cache and cache dir")

    if not no_cache:
        cache_dir = init_cache(cache_dir)
    else:
        cache_dir = None

    index_count = 0
    if paper_id:  # Index a single paper.
        TO_INDEX = [paper_id]
    elif id_list:  # Index a list of papers.
        TO_INDEX = load_id_list(id_list)
    else:
        TO_INDEX = load_id_sample()
    approx_size = len(TO_INDEX)

    retrieve_chunk_size = 50
    index_chunk_size = 250
    chunk: List[str] = []
    meta: List[DocMeta] = []
    index.SearchSession.create_index()
    progress = NoopContextManager() if quiet \
        else click.progressbar(length=approx_size)
    try:
        with progress as index_bar:
            last = len(TO_INDEX) - 1
            for i, paper_id in enumerate(TO_INDEX):
                this_meta = []
                if cache_dir:
                    this_meta = from_cache(cache_dir, paper_id)

                if this_meta:
                    meta += this_meta
                else:
                    chunk.append(paper_id)

                if len(chunk) == retrieve_chunk_size or (chunk and i == last):
                    try:
                        new_meta = metadata.bulk_retrieve(chunk)
                    except metadata.ConnectionFailed:  # Try again.
                        new_meta = metadata.bulk_retrieve(chunk)

                    meta += new_meta
                    chunk = []
                    if not no_cache:
                        # Add metadata to the cache.
                        new_meta_srt = sorted(new_meta, key=get_paper_id)
                        for paper_id, grp in groupby(new_meta_srt,
                                                     get_paper_id):
                            to_cache(cache_dir, paper_id, [dm for dm in grp])

                # Index papers on a different chunk cycle, and at the very end.
                if len(meta) >= index_chunk_size or i == last:
                    # Transform to Document.
                    docs = [transform.to_search_document(dm) for dm in meta]
                    # Add to index.
                    index.SearchSession.bulk_add_documents(docs)

                    if print_indexable:
                        for document in docs:
                            click.echo(json.dumps(asdict(document)))
                    index_count += len(docs)
                    meta = []
                    index_bar.update(i)
    finally:
        if not quiet:
            click.echo(f"Indexed {index_count} documents in total")
            if cache_dir:
                click.echo(
                    f"Cache path: {cache_dir}; use `-c {cache_dir}` to reuse in"
                    f" subsequent calls")