Example #1
def test_run(search: Search):

    search.limit = 14
    search.limit_per_database = None

    acm_searcher.run(search)

    assert len(search.papers) == 14
Example #2
def test_run(search: Search):

    search.limit = 20
    search.limit_per_database = None
    search.since = datetime.date(2020, 8, 26)
    search.until = datetime.date(2020, 8, 26)

    arxiv_searcher.run(search)

    assert len(search.papers) == 18
Example #3
def run(search: Search, api_token: str):
    """
    This method fetches papers from the IEEE database using the provided search parameters.
    After fetching the data from IEEE, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the IEEE database

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)
    total_papers = result.get('total_records')

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for paper_entry in result.get('articles'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            
            papers_count += 1

            try:

                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, api_token, papers_count+1)
Example #4
def run(search: Search, database: str):
    """
    This method fetches papers from the medRxiv/bioRxiv database using the provided search parameters.
    After fetching the data from medRxiv/bioRxiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        data = _get_data(url)

        total_papers = 0
        if len(data) > 0:
            total_papers = data[0].get('total_papers')

        logging.info(f'{database}: {total_papers} papers to fetch from request {i+1}/{len(urls)}')

        papers_count = 0
        dois = sum([d.get('dois') for d in data], [])

        for doi in dois:
            if papers_count >= total_papers or search.reached_its_limit(database):
                break
            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)

                paper_title = paper_metadata.get('title')
                
                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')
                
                paper = _get_paper(paper_metadata)
                
                paper.add_database(database)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)
Example #5
def _filter(search: Search):
    """
    Private method that filters the search results

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None:
        for paper in list(search.papers):
            try:
                if (paper.publication is not None and paper.publication.category.lower() not in search.publication_types) or \
                    (paper.publication is None and 'other' not in search.publication_types):
                    search.remove_paper(paper)
            except Exception:
                pass
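
For intuition, the removal condition above can be read as the following standalone predicate; this is a minimal sketch over plain values, not the actual Paper/Search classes:

def _should_remove(publication_category, publication_types):
    # a paper is removed when its publication category is not among the requested
    # types, or when it has no publication and 'other' was not requested
    if publication_category is not None:
        return publication_category.lower() not in publication_types
    return 'other' not in publication_types

# _should_remove('Journal', ['journal', 'book'])  -> False (paper is kept)
# _should_remove(None, ['journal'])               -> True  (paper is removed)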
Example #6
def test_save_and_load(search: Search, paper: Paper):

    temp_dirpath = tempfile.mkdtemp()
    temp_filepath = os.path.join(temp_dirpath, 'output.json')

    search.add_paper(paper)

    findpapers.save(search, temp_filepath)

    loaded_search = findpapers.load(temp_filepath)

    assert loaded_search.query == search.query
    assert loaded_search.since == search.since
    assert loaded_search.until == search.until
    assert loaded_search.limit == search.limit
    assert loaded_search.limit_per_database == search.limit_per_database
    assert loaded_search.processed_at.strftime(
        '%Y-%m-%d %H:%M:%S') == search.processed_at.strftime(
            '%Y-%m-%d %H:%M:%S')
    assert len(loaded_search.papers) == len(search.papers)
Example #7
def test_run(search: Search):

    search.limit = 26
    ieee_searcher.run(search, 'fake-api-token')

    assert len(search.papers) == 26

    with pytest.raises(AttributeError):
        ieee_searcher.run(search, '')

    with pytest.raises(AttributeError):
        ieee_searcher.run(search, None)
Example #8
def load(search_path: str):
    """
    Method used to load a search result from a JSON representation

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results

    Returns
    -------
    Search
        A Search instance loaded from the given JSON file
    """

    with open(search_path, 'r') as jsonfile:
        return Search.from_dict(json.load(jsonfile))
Example #9
def test_get_search_urls(search: Search):

    search.query = '([term a] AND [term b]) OR ([term c] OR [term d])'
    urls = rxiv_searcher._get_search_urls(search, 'medRxiv')

    assert len(urls) == 2

    with pytest.raises(ValueError):  # wildcards not supported
        search.query = '([term a] AND [term ?]) OR ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # AND NOT not supported
        search.query = '([term a] AND NOT [term b]) OR ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # Max 1-level parentheses group
        search.query = '(([term a] OR [term b]) OR ([term c] OR [term d])) OR [term e]'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # only OR between groups
        search.query = '([term a] AND [term b]) AND ([term c] OR [term d])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')

    with pytest.raises(ValueError):  # Mixed connectors not supported
        search.query = '([term a] AND [term b] OR [term c])'
        rxiv_searcher._get_search_urls(search, 'medRxiv')
Example #10
def save(search: Search, outputpath: str):
    """
    Method used to save a search result in a JSON representation

    Parameters
    ----------
    search : Search
        A Search instance
    outputpath : str
        A valid file path used to save the search results
    """

    with open(outputpath, 'w') as jsonfile:
        json.dump(Search.to_dict(search), jsonfile, indent=2, sort_keys=True)
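
Together with the load method shown in Example #8, this gives a simple persistence round trip; a minimal usage sketch (the file path is hypothetical and 'search' is assumed to be an existing Search instance):

import findpapers

# persist a finished search and read it back later
findpapers.save(search, '/tmp/my_search.json')
loaded_search = findpapers.load('/tmp/my_search.json')
assert loaded_search.query == search.query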
Example #11
def _database_safe_run(function: callable, search: Search, database_label: str):
    """
    Private method that calls a provided function, catching all exceptions without raising them and only logging the error

    Parameters
    ----------
    function : callable
        A function that will be called to fetch data from a database
    search : Search
        A search instance
    database_label : str
        A database label
    """
    if not search.reached_its_limit(database_label):
        logging.info(f'Fetching papers from {database_label} database...')
        try:
            function()
        except Exception:  # pragma: no cover
            logging.debug(
                f'Error while fetching papers from {database_label} database', exc_info=True)
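
The helper is meant to receive a zero-argument callable wrapping the actual searcher call, as the search orchestration code does later in this section; a usage sketch (assuming 'search' and the arxiv_searcher module are already in scope):

# wrap the arXiv fetch so a failure in one database doesn't abort the whole search
_database_safe_run(lambda: arxiv_searcher.run(search),
                   search, arxiv_searcher.DATABASE_LABEL)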
Example #12
def run(search: Search):
    """
    This method fetches papers from the PubMed database using the provided search parameters.
    After fetching the data from PubMed, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skipping PubMed search, journal publication type not in filters. Currently PubMed only retrieves papers published in journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))
    
    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        for pubmed_id in result.get('eSearchResult').get('IdList').get('Id'):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break
            
            papers_count += 1
            
            try:

                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:

                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            result = _get_api_result(search, papers_count)
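
For reference, the nested lookups above expect the single-article payload to have roughly the following shape; this is a minimal hypothetical excerpt showing only the keys actually read for the title:

paper_entry = {
    'PubmedArticleSet': {
        'PubmedArticle': {
            'MedlineCitation': {
                'Article': {'ArticleTitle': 'Some paper title'}
            }
        }
    }
}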
Example #13
def test_output(search: Search, paper: Paper):

    paper.publication.category = 'Journal'
    paper.categories = {'Facet A': ['Category A', 'Category B']}
    paper.selected = False
    search.add_paper(paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-CONF'
    other_paper.publication.category = 'Conference Proceedings'
    other_paper.title = 'Conference paper title'
    other_paper.doi = 'fake-doi-conference-paper'
    other_paper.selected = True
    other_paper.categories = {
        'Facet A': ['Category C'],
        'Facet B': ['Category 1']
    }
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication.issn = 'ISSN-BOOK'
    other_paper.publication.category = 'Book'
    other_paper.title = 'Book paper title'
    other_paper.doi = 'fake-doi-book-paper'
    other_paper.categories = None
    search.add_paper(other_paper)

    other_paper = copy.deepcopy(paper)
    other_paper.publication = None
    other_paper.title = 'Unpublished paper title'
    other_paper.doi = None
    other_paper.selected = True
    other_paper.categories = {'Facet A': ['Category A']}
    search.add_paper(other_paper)

    search_path = tempfile.NamedTemporaryFile().name
    outputpath = tempfile.NamedTemporaryFile().name

    persistence_util.save(search, search_path)

    findpapers.generate_bibtex(search_path, outputpath)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    article_header = '@article{drpaul1969awesome'
    inproceedings_header = '@inproceedings{drpaul1969conference'
    book_header = '@book{drpaul1969book'
    unpublished = '@unpublished{drpaul1969unpublished'

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path,
                               outputpath,
                               only_selected_papers=True)
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header not in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(search_path,
                               outputpath,
                               categories_filter={
                                   'Facet A': ['Category A'],
                                   'Facet B': ['Category 1']
                               })
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished in generated_bibtex

    findpapers.generate_bibtex(
        search_path,
        outputpath,
        categories_filter={'Facet A': ['Category B', 'Category C']})
    with open(outputpath) as fp:
        generated_bibtex = fp.read()

    assert article_header in generated_bibtex
    assert inproceedings_header in generated_bibtex
    assert book_header not in generated_bibtex
    assert unpublished not in generated_bibtex
Example #14
def test_run(search: Search):

    search.limit = 51
    pubmed_searcher.run(search)

    assert len(search.papers) == 51
Example #15
def run(search: Search):
    """
    This method fetches papers from the ACM database using the provided search parameters.
    After fetching the data from ACM, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    """

    papers_count = 0
    result = _get_result(search)

    try:
        total_papers = int(result.xpath(
            '//*[@class="hitsLength"]')[0].text.strip())
    except Exception:  # pragma: no cover
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL+x.attrib['href']
                       for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)

                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')
                
                paper_doi = None
                if '/abs/' in paper_url:
                    paper_doi = paper_url.split('/abs/')[1]
                elif '/book/' in paper_url:
                    paper_doi = paper_url.split('/book/')[1]
                else:
                    paper_doi = paper_url.split('/doi/')[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)

                if paper is None:
                    continue
                
                paper.add_database(DATABASE_LABEL)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
Example #16
def search():
    return Search('"this" AND ("that thing" OR "something") AND NOT "anything"', datetime.date(1969, 1, 30), datetime.date(2020, 12, 31), 100, 100)
Example #17
def search(outputpath: str, query: Optional[str] = None, since: Optional[datetime.date] = None, until: Optional[datetime.date] = None,
        limit: Optional[int] = None, limit_per_database: Optional[int] = None, databases: Optional[List[str]] = None,
        publication_types: Optional[List[str]] = None, scopus_api_token: Optional[str] = None, ieee_api_token: Optional[str] = None,
        proxy: Optional[str] = None, verbose: Optional[bool] = False):
    """
    When you have a query and need to get papers using it, this is the method you'll need to call.
    This method will find papers from some databases based on the provided query.

    Parameters
    ----------
    outputpath : str
        A valid file path where the search result file will be placed

    query : str, optional

        A query string that will be used to perform the papers search.
        
        If not provided, the query will be loaded from the environment variable FINDPAPERS_QUERY

        All the query terms need to be enclosed in quotes and can be associated using boolean operators,
        and grouped using parentheses. 
        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single character or * to replace any number of characters. 
        E.g.: "son?" -> will match song, sons, ...
        E.g.: "son*" -> will match song, sons, sonar, songwriting, ...

        Note: All boolean operators need to be uppercase. The boolean operator "NOT" must be preceded by an "AND" operator.

    since : Optional[datetime.date], optional
        A lower bound (inclusive) date that will be used to filter the search results, by default None

    until : Optional[datetime.date], optional
        An upper bound (inclusive) date that will be used to filter the search results, by default None

    limit : Optional[int], optional
        The max number of papers to collect, by default None

    limit_per_database : Optional[int], optional
        The max number of papers to collect from each database, by default None

    databases : List[str], optional
        List of databases where the search should be performed, if not specified all databases will be used, by default None

    publication_types : List[str], optional
        List of publication types to filter when searching, if not specified all publication types
        will be collected (this parameter is case insensitive). The available publication types are: journal, conference proceedings, book, other. By default None

    scopus_api_token : Optional[str], optional
        An API token used to fetch data from the Scopus database. If you don't have one, go to https://dev.elsevier.com and get it, by default None

    ieee_api_token : Optional[str], optional
        An API token used to fetch data from the IEEE database. If you don't have one, go to https://developer.ieee.org and get it, by default None
    
    proxy : Optional[str], optional
        Proxy URL that can be used during requests. This can also be defined by the environment variable FINDPAPERS_PROXY, by default None

    verbose : Optional[bool], optional
        If you want verbose logging, by default False
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy
    
    logging.info('Let\'s find some papers, this process may take a while...')

    if databases is not None:
        databases = [x.lower() for x in databases]
    
    if publication_types is not None:
        publication_types = [x.lower().strip() for x in publication_types]
        for publication_type in publication_types:
            if publication_type not in ['journal', 'conference proceedings', 'book', 'other']:
                raise ValueError(f'Invalid publication type: {publication_type}')

    if query is None:
        query = os.getenv('FINDPAPERS_QUERY')

    if query is not None:
        query = _sanitize_query(query)

    if query is None or not _is_query_ok(query):
        raise ValueError('Invalid query format')

    common_util.check_write_access(outputpath)

    if ieee_api_token is None:
        ieee_api_token = os.getenv('FINDPAPERS_IEEE_API_TOKEN')

    if scopus_api_token is None:
        scopus_api_token = os.getenv('FINDPAPERS_SCOPUS_API_TOKEN')

    search = Search(query, since, until, limit, limit_per_database, databases=databases, publication_types=publication_types)

    if databases is None or arxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: arxiv_searcher.run(search),
                        search, arxiv_searcher.DATABASE_LABEL)
    
    if databases is None or pubmed_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: pubmed_searcher.run(search),
                        search, pubmed_searcher.DATABASE_LABEL)

    if databases is None or acm_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: acm_searcher.run(search),
                        search, acm_searcher.DATABASE_LABEL)

    if ieee_api_token is not None:
        if databases is None or ieee_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: ieee_searcher.run(
                search, ieee_api_token), search, ieee_searcher.DATABASE_LABEL)
    else:
        logging.info('IEEE API token not found, skipping search on this database')

    if scopus_api_token is not None:
        if databases is None or scopus_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: scopus_searcher.run(
                search, scopus_api_token), search, scopus_searcher.DATABASE_LABEL)
    else:
        logging.info('Scopus API token not found, skipping search on this database')

    if databases is None or medrxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: medrxiv_searcher.run(search),
                        search, medrxiv_searcher.DATABASE_LABEL)

    if databases is None or biorxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: biorxiv_searcher.run(search),
                        search, biorxiv_searcher.DATABASE_LABEL)

    logging.info('Enriching results...')

    _enrich(search, scopus_api_token)

    logging.info('Filtering results...')

    _filter(search)

    logging.info('Finding and merging duplications...')

    search.merge_duplications()

    logging.info('Flagging potentially predatory publications...')

    _flag_potentially_predatory_publications(search)

    logging.info(f'It\'s finally over! {len(search.papers)} papers retrieved. Good luck with your research :)')

    persistence_util.save(search, outputpath)
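
A minimal sketch of calling this entry point, assuming it is exposed as findpapers.search like the save/load/generate_bibtex methods used elsewhere in these examples (the output path and query are hypothetical; without API tokens the IEEE and Scopus searches are skipped):

import datetime
import findpapers

findpapers.search(
    '/tmp/findpapers_results.json',
    query='"deep learning" AND ("image segmentation" OR "object detection")',
    since=datetime.date(2020, 1, 1),
    until=datetime.date(2020, 12, 31),
    limit=100,
    limit_per_database=25,
    publication_types=['journal', 'conference proceedings'],
)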
Example #18
def test_search(paper: Paper):

    paper.doi = None

    search = Search('this AND that', datetime.date(1969, 1, 30),
                    datetime.date(1970, 4, 8), 2)

    assert len(search.papers) == 0

    search.add_paper(paper)
    assert len(search.papers) == 1
    search.add_paper(paper)
    assert len(search.papers) == 1

    another_paper = Paper('awesome paper title 2', 'a long abstract',
                          paper.authors, paper.publication,
                          paper.publication_date, paper.urls)
    another_paper.add_database('arXiv')

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    assert paper == search.get_paper(paper.title, paper.publication_date,
                                     paper.doi)
    assert paper.publication == search.get_publication(paper.publication.title,
                                                       paper.publication.issn,
                                                       paper.publication.isbn)

    search.remove_paper(another_paper)
    assert len(search.papers) == 1
    assert paper in search.papers

    search.limit_per_database = 1
    with pytest.raises(OverflowError):
        search.add_paper(another_paper)
    search.limit_per_database = 2

    search.add_paper(another_paper)
    assert len(search.papers) == 2

    another_paper_2 = copy.deepcopy(paper)
    another_paper_2.title = 'awesome paper title 3'
    another_paper_2.abstract = 'a long abstract'
    another_paper_2.databases = set()

    with pytest.raises(ValueError):
        search.add_paper(another_paper_2)

    another_paper_2.add_database('arXiv')

    with pytest.raises(OverflowError):
        search.add_paper(another_paper_2)

    search.merge_duplications()
    assert len(search.papers) == 1

    publication_title = 'FAKE-TITLE'
    publication_issn = 'FAKE-ISSN'
    publication_isbn = 'FAKE-ISBN'
    assert search.get_publication_key(
        publication_title, publication_issn,
        publication_isbn) == f'ISBN-{publication_isbn.lower()}'
    assert search.get_publication_key(
        publication_title,
        publication_issn) == f'ISSN-{publication_issn.lower()}'
    assert search.get_publication_key(
        publication_title) == f'TITLE-{publication_title.lower()}'
Example #19
def run(search: Search):
    """
    This method fetches papers from the arXiv database using the provided search parameters.
    After fetching the data from arXiv, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance

    """

    papers_count = 0
    result = _get_api_result(search)

    total_papers = int(
        result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while (papers_count < total_papers
           and not search.reached_its_limit(DATABASE_LABEL)):

        entries = result.get('feed', {}).get('entry', [])
        if not isinstance(entries, list):  # if there's only one entry the result is not a list, just a dict
            entries = [entries]

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(
                    DATABASE_LABEL):
                break

            papers_count += 1

            try:

                paper_title = paper_entry.get("title")
                logging.info(
                    f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}'
                )

                published_date = datetime.datetime.strptime(
                    paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # the arXiv API doesn't provide a date filter, so we need to filter the results ourselves
                if search.since is not None and published_date < search.since:
                    logging.info(
                        'Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info(
                        'Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(
                DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)
Example #20
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    This method fetches papers from the Scopus database using the provided search parameters.
    After fetching the data from Scopus, the collected papers are added to the provided search instance.

    Parameters
    ----------
    search : Search
        A search instance
    api_token : str
        The API key used to fetch data from the Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution,
        usually used to make the next recursive call when paginating results
    papers_count : Optional[int]
        Papers count used on recursion calls

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    search_results = _get_search_results(search, api_token, url)

    total_papers = int(search_results.get('opensearch:totalResults', 0))

    logging.info(f'Scopus: {total_papers} papers to fetch')

    for paper_entry in search_results.get('entry', []):

        if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
            break

        papers_count += 1

        try:

            paper_title = paper_entry.get("dc:title")
            logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

            publication = _get_publication(paper_entry, api_token)
            paper = _get_paper(paper_entry, publication)

            if paper is not None:
                paper.add_database(DATABASE_LABEL)
                search.add_paper(paper)

        except Exception as e:  # pragma: no cover
            logging.debug(e, exc_info=True)

    next_url = None
    for link in search_results['link']:
        if link['@ref'] == 'next':
            next_url = link['@href']
            break

    # If there is a next URL, the API response was paginated and we need to process the next URL
    # We'll make a recursive call for it
    if papers_count < total_papers and next_url is not None and not search.reached_its_limit(DATABASE_LABEL):
        run(search, api_token, next_url, papers_count)
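
The pagination above hinges on the 'link' list in the Scopus response; a minimal hypothetical shape showing only the keys the code reads (URLs are placeholders):

search_results = {
    'opensearch:totalResults': '250',
    'entry': [],
    'link': [
        {'@ref': 'self', '@href': 'https://example.com/search?start=0'},
        {'@ref': 'next', '@href': 'https://example.com/search?start=25'},
    ],
}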