def search(outputpath: str, query: Optional[str] = None, since: Optional[datetime.date] = None,
           until: Optional[datetime.date] = None, limit: Optional[int] = None,
           limit_per_database: Optional[int] = None, databases: Optional[List[str]] = None,
           publication_types: Optional[List[str]] = None, scopus_api_token: Optional[str] = None,
           ieee_api_token: Optional[str] = None, proxy: Optional[str] = None,
           verbose: Optional[bool] = False):
    """
    When you have a query and need to get papers using it, this is the method to call.
    It will find papers in the supported databases based on the provided query.

    Parameters
    ----------
    outputpath : str
        A valid file path where the search result file will be placed
    query : str, optional
        A query string that will be used to perform the paper search. If not provided,
        the query will be loaded from the environment variable FINDPAPERS_QUERY.

        All the query terms need to be enclosed in quotes and can be combined using
        boolean operators, and grouped using parentheses.
        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single
        character or * to replace any number of characters.
        E.g.: "son?" -> will match song, sons, ...
        E.g.: "son*" -> will match song, sons, sonar, songwriting, ...

        Note: All boolean operators need to be uppercase. The boolean operator "NOT"
        must be preceded by an "AND" operator.
    since : Optional[datetime.date], optional
        A lower bound (inclusive) date that will be used to filter the search results, by default None
    until : Optional[datetime.date], optional
        An upper bound (inclusive) date that will be used to filter the search results, by default None
    limit : Optional[int], optional
        The maximum number of papers to collect, by default None
    limit_per_database : Optional[int], optional
        The maximum number of papers to collect per database, by default None
    databases : Optional[List[str]], optional
        List of databases where the search should be performed. If not specified,
        all databases will be used, by default None
    publication_types : Optional[List[str]], optional
        List of publication types to filter when searching. If not specified, all
        publication types will be collected (this parameter is case insensitive).
        The available publication types are: journal, conference proceedings, book,
        other. By default None
    scopus_api_token : Optional[str], optional
        An API token used to fetch data from the Scopus database. If you don't have
        one, go to https://dev.elsevier.com and get it, by default None
    ieee_api_token : Optional[str], optional
        An API token used to fetch data from the IEEE database. If you don't have
        one, go to https://developer.ieee.org and get it, by default None
    proxy : Optional[str], optional
        A proxy URL that can be used during requests. This can also be defined by
        the environment variable FINDPAPERS_PROXY, by default None
    verbose : Optional[bool], optional
        Whether verbose logging should be enabled, by default False
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    logging.info('Let\'s find some papers, this process may take a while...')

    if databases is not None:
        databases = [x.lower() for x in databases]

    if publication_types is not None:
        publication_types = [x.lower().strip() for x in publication_types]
        for publication_type in publication_types:
            if publication_type not in ['journal', 'conference proceedings', 'book', 'other']:
                raise ValueError(f'Invalid publication type: {publication_type}')

    if query is None:
        query = os.getenv('FINDPAPERS_QUERY')

    if query is not None:
        query = _sanitize_query(query)

    if query is None or not _is_query_ok(query):
        raise ValueError('Invalid query format')

    common_util.check_write_access(outputpath)

    if ieee_api_token is None:
        ieee_api_token = os.getenv('FINDPAPERS_IEEE_API_TOKEN')

    if scopus_api_token is None:
        scopus_api_token = os.getenv('FINDPAPERS_SCOPUS_API_TOKEN')

    search = Search(query, since, until, limit, limit_per_database,
                    databases=databases, publication_types=publication_types)

    if databases is None or arxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: arxiv_searcher.run(search),
                           search, arxiv_searcher.DATABASE_LABEL)

    if databases is None or pubmed_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: pubmed_searcher.run(search),
                           search, pubmed_searcher.DATABASE_LABEL)

    if databases is None or acm_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: acm_searcher.run(search),
                           search, acm_searcher.DATABASE_LABEL)

    if ieee_api_token is not None:
        if databases is None or ieee_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: ieee_searcher.run(search, ieee_api_token),
                               search, ieee_searcher.DATABASE_LABEL)
    else:
        logging.info('IEEE API token not found, skipping search on this database')

    if scopus_api_token is not None:
        if databases is None or scopus_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: scopus_searcher.run(search, scopus_api_token),
                               search, scopus_searcher.DATABASE_LABEL)
    else:
        logging.info('Scopus API token not found, skipping search on this database')

    if databases is None or medrxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: medrxiv_searcher.run(search),
                           search, medrxiv_searcher.DATABASE_LABEL)

    if databases is None or biorxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: biorxiv_searcher.run(search),
                           search, biorxiv_searcher.DATABASE_LABEL)

    logging.info('Enriching results...')
    _enrich(search, scopus_api_token)

    logging.info('Filtering results...')
    _filter(search)

    logging.info('Finding and merging duplications...')
    search.merge_duplications()

    logging.info('Flagging potentially predatory publications...')
    _flag_potentially_predatory_publications(search)

    logging.info(f'It\'s finally over! {len(search.papers)} papers retrieved. Good luck with your research :)')

    persistence_util.save(search, outputpath)
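# Usage sketch: a hypothetical invocation of search() following the docstring above.
# The output path, query, date bounds, limits, and database labels are illustrative
# values only; API tokens would be picked up from the environment variables here.
if __name__ == '__main__':
    search('/tmp/findpapers_results.json',
           query='"deep learning" AND ("image segmentation" OR "object detection") AND NOT "survey"',
           since=datetime.date(2019, 1, 1),
           until=datetime.date(2021, 12, 31),
           limit=100,
           limit_per_database=25,
           databases=['arXiv', 'PubMed'],
           verbose=True)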
def search():
    # a Search populated with a query plus since/until dates, limit, and limit_per_database
    return Search('"this" AND ("that thing" OR "something") AND NOT "anything"',
                  datetime.date(1969, 1, 30), datetime.date(2020, 12, 31), 100, 100)
def test_search(paper: Paper):

    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30), datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1

    search.add_paper(paper)  # re-adding the same paper must not create a duplicate
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract', paper.authors,
                          paper.publication, paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date, paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):  # adding would exceed the per-database limit
        search.add_paper(another_paper)
    search.limit_per_database = 2

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):  # a paper without a database cannot be added
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):  # the overall limit of 2 papers is already reached
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'

    # the publication key prefers ISBN, then ISSN, then falls back to the title
    assert search.get_publication_key(publication_title, publication_issn,
                                      publication_isbn) == f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(publication_title,
                                      publication_issn) == f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(publication_title) == f'TITLE-{publication_title.lower()}'
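# Illustrative sketch (not part of the test suite): the publication key precedence
# checked by the final assertions above. The key prefers ISBN, then ISSN, then the
# lowercased title. The query and identifier values below are made up; the block is
# commented out so it never runs at import time.
#
# search = Search('"anything"')
# assert search.get_publication_key('Some Title', '1234-5678', '978-3-16-148410-0') == 'ISBN-978-3-16-148410-0'
# assert search.get_publication_key('Some Title', '1234-5678') == 'ISSN-1234-5678'
# assert search.get_publication_key('Some Title') == 'TITLE-some title'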