def search(outputpath: str, query: Optional[str] = None, since: Optional[datetime.date] = None, until: Optional[datetime.date] = None,
        limit: Optional[int] = None, limit_per_database: Optional[int] = None, databases: Optional[List[str]] = None,
        publication_types: Optional[List[str]] = None, scopus_api_token: Optional[str] = None, ieee_api_token: Optional[str] = None,
        proxy: Optional[str] = None, verbose: Optional[bool] = False):
    """
    When you have a query and needs to get papers using it, this is the method that you'll need to call.
    This method will find papers from some databases based on the provided query.

    Parameters
    ----------
    outputpath : str
        A valid file path where the search result file will be placed

    query : str, optional

        A query string that will be used to perform the papers search.
        
        If not provided, the query will be loaded from the environment variable FINDPAPERS_QUERY

        All the query terms need to be enclosed in quotes and can be associated using boolean operators,
        and grouped using parentheses. 
        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single character or * to replace any number of characters. 
        E.g.: "son?" -> will match song, sons, ...
        E.g.: "son*" -> will match song, sons, sonar, songwriting, ...

        Note: All boolean operators needs to be uppercased. The boolean operator "NOT" must be preceded by an "AND" operator.

    since : Optional[datetime.date], optional
        A lower bound (inclusive) date that will be used to filter the search results, by default None

    until : Optional[datetime.date], optional
        A upper bound (inclusive) date that will be used to filter the search results, by default None

    limit : Optional[int], optional
        The max number of papers to collect, by default None

    limit_per_database : Optional[int], optional
        The max number of papers to collect per each database, by default None

    databases : List[str], optional
        List of databases where the search should be performed, if not specified all databases will be used, by default None

    publication_types : List[str], optional
        List of publication list of publication types to filter when searching, if not specified all the publication types 
        will be collected (this parameter is case insensitive). The available publication types are: journal, conference proceedings, book, other, by default None

    scopus_api_token : Optional[str], optional
        A API token used to fetch data from Scopus database. If you don't have one go to https://dev.elsevier.com and get it, by default None

    ieee_api_token : Optional[str], optional
        A API token used to fetch data from IEEE database. If you don't have one go to https://developer.ieee.org and get it, by default None
    
    proxy : Optional[str], optional
        proxy URL that can be used during requests. This can be also defined by an environment variable FINDPAPERS_PROXY. By default None

    verbose : Optional[bool], optional
        If you wanna a verbose logging
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy
    
    logging.info('Let\'s find some papers, this process may take a while...')

    if databases is not None:
        databases = [x.lower() for x in databases]
    
    if publication_types is not None:
        publication_types = [x.lower().strip() for x in publication_types]
        for publication_type in publication_types:
            if publication_type not in ['journal', 'conference proceedings', 'book', 'other']:
                raise ValueError(f'Invalid publication type: {publication_type}')

    if query is None:
        query = os.getenv('FINDPAPERS_QUERY')

    if query is not None:
        query = _sanitize_query(query)

    if query is None or not _is_query_ok(query):
        raise ValueError('Invalid query format')

    common_util.check_write_access(outputpath)

    if ieee_api_token is None:
        ieee_api_token = os.getenv('FINDPAPERS_IEEE_API_TOKEN')

    if scopus_api_token is None:
        scopus_api_token = os.getenv('FINDPAPERS_SCOPUS_API_TOKEN')

    search = Search(query, since, until, limit, limit_per_database, databases=databases, publication_types=publication_types)

    if databases is None or arxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: arxiv_searcher.run(search),
                        search, arxiv_searcher.DATABASE_LABEL)
    
    if databases is None or pubmed_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: pubmed_searcher.run(search),
                        search, pubmed_searcher.DATABASE_LABEL)

    if databases is None or acm_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: acm_searcher.run(search),
                        search, acm_searcher.DATABASE_LABEL)

    if ieee_api_token is not None:
        if databases is None or ieee_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: ieee_searcher.run(
                search, ieee_api_token), search, ieee_searcher.DATABASE_LABEL)
    else:
        logging.info('IEEE API token not found, skipping search on this database')

    if scopus_api_token is not None:
        if databases is None or scopus_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: scopus_searcher.run(
                search, scopus_api_token), search, scopus_searcher.DATABASE_LABEL)
    else:
        logging.info('Scopus API token not found, skipping search on this database')

    if databases is None or medrxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: medrxiv_searcher.run(search),
                        search, medrxiv_searcher.DATABASE_LABEL)

    if databases is None or biorxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: biorxiv_searcher.run(search),
                        search, biorxiv_searcher.DATABASE_LABEL)

    logging.info('Enriching results...')

    _enrich(search, scopus_api_token)

    logging.info('Filtering results...')

    _filter(search)

    logging.info('Finding and merging duplications...')

    search.merge_duplications()

    logging.info('Flagging potentially predatory publications...')

    _flag_potentially_predatory_publications(search)

    logging.info(f'It\'s finally over! {len(search.papers)} papers retrieved. Good luck with your research :)')

    persistence_util.save(search, outputpath)
Exemple #2
0
def search():
    return Search('"this" AND ("that thing" OR "something") AND NOT "anything"', datetime.date(1969, 1, 30), datetime.date(2020, 12, 31), 100, 100)
def test_search(paper: Paper):

    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30),
                    datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1
    search.add_paper(paper)
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract',
                          paper.authors, paper.publication,
                          paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date,
                                     paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(another_paper)
    search.limit_per_database = 2

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'
    assert search.get_publication_key(
        publication_title, publication_issn,
        publication_isbn) == f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(
        publication_title,
        publication_issn) == f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(
        publication_title) == f'TITLE-{publication_title.lower()}'