Esempio n. 1
0
def run(search: Search, api_token: str):
    """
    Fetch papers from the IEEE database using the provided search parameters.
    The collected papers are added to the provided search instance.

    Results are requested page by page until every reported record was visited,
    the search reaches its limit for this database, or the API answers with a
    page that has no articles.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it
    api_token : str
        The API key used to fetch data from IEEE database

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    papers_count = 0
    result = _get_api_result(search, api_token)

    # a response without 'total_records' is handled as "nothing to fetch"
    # instead of failing on a None comparison in the loop condition
    total_papers = result.get('total_records') or 0

    logging.info(f'IEEE: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        articles = result.get('articles') or []
        if not articles:
            # fewer records delivered than announced: stop here, otherwise
            # the very same offset would be requested over and over again
            break

        for paper_entry in articles:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            # the entry counts as visited even when its conversion fails below
            papers_count += 1

            try:

                logging.info(f'({papers_count}/{total_papers}) Fetching IEEE paper: {paper_entry.get("title")}')

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken entry must not abort the whole fetching
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            # presumably the third argument is the 1-based position of the
            # first record of the next page - confirm against _get_api_result
            result = _get_api_result(search, api_token, papers_count+1)
Esempio n. 2
0
def run(search: Search, database: str):
    """
    Fetch papers from medRxiv/bioRxiv database using the provided search parameters.
    The collected papers are added to the provided search instance.

    One request is made for each URL returned by ``_get_search_urls``; the DOIs
    found on every returned page are then resolved one by one.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it
    database : str
        The database name (medRxiv or bioRxiv)
    """

    urls = _get_search_urls(search, database)

    for i, url in enumerate(urls):

        if search.reached_its_limit(database):
            break

        logging.info(f'{database}: Requesting for papers...')

        # an empty/None answer is handled as "no pages" for this request
        data = _get_data(url) or []

        # the total is read from the first page only; a missing value means zero
        total_papers = 0
        if data:
            total_papers = data[0].get('total_papers') or 0

        logging.info(f'{database}: {total_papers} papers to fetch from {i+1}/{len(urls)} papers requests')

        papers_count = 0

        # flatten the DOIs of every page in a single pass, pages without
        # a 'dois' value are skipped instead of raising
        dois = [doi for page in data for doi in (page.get('dois') or [])]

        for doi in dois:
            if papers_count >= total_papers or search.reached_its_limit(database):
                break
            try:
                papers_count += 1
                paper_metadata = _get_paper_metadata(doi, database)

                paper_title = paper_metadata.get('title')

                logging.info(f'({papers_count}/{total_papers}) Fetching {database} paper: {paper_title}')

                paper = _get_paper(paper_metadata)

                # same None guard used by the other database fetchers
                if paper is not None:
                    paper.add_database(database)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken paper must not abort the whole fetching
                logging.debug(e, exc_info=True)
def _database_safe_run(function: callable, search: Search, database_label: str):
    """
    Private method that runs a database fetching function in a best-effort way.

    Nothing is called when the search already reached its limit for the given
    database. Any exception raised by the function is caught and never
    propagated, it's only logged (DEBUG level, with traceback).

    Parameters
    ----------
    function : callable
        A function that will be called, without arguments, for database fetching
    search : Search
        A search instance
    database_label : str
        A database label
    """
    # guard clause: limit already reached, so there is nothing to fetch
    if search.reached_its_limit(database_label):
        return

    logging.info(f'Fetching papers from {database_label} database...')

    try:
        function()
    except Exception:  # pragma: no cover
        logging.debug(
            f'Error while fetching papers from {database_label} database', exc_info=True)
Esempio n. 4
0
def run(search: Search):
    """
    Fetch papers from the PubMed database using the provided search parameters.
    The collected papers are added to the provided search instance.

    The search is skipped when the search instance filters by publication
    types and 'journal' is not one of them.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it
    """

    if search.publication_types is not None and 'journal' not in search.publication_types:
        logging.info('Skiping PubMed search, journal publication type not in filters. Nowadays the PubMed only retrieves papers published on journals.')
        return

    papers_count = 0
    result = _get_api_result(search)

    # a response carrying an error list is handled as an empty result
    if result.get('eSearchResult').get('ErrorList', None) is not None:
        total_papers = 0
    else:
        total_papers = int(result.get('eSearchResult').get('Count'))

    logging.info(f'PubMed: {total_papers} papers to fetch')

    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        id_list = result.get('eSearchResult').get('IdList') or {}
        pubmed_ids = id_list.get('Id') or []

        # NOTE(review): a page holding a single id is presumably parsed as a
        # bare string instead of a list - confirm against _get_api_result.
        # Iterating a string would treat each character as a PubMed id
        if isinstance(pubmed_ids, str):
            pubmed_ids = [pubmed_ids]

        if not pubmed_ids:
            # fewer ids delivered than announced: stop here, otherwise
            # the very same offset would be requested over and over again
            break

        for pubmed_id in pubmed_ids:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            # the id counts as visited even when its processing fails below
            papers_count += 1

            try:

                paper_entry = _get_paper_entry(pubmed_id)

                if paper_entry is not None:

                    paper_title = paper_entry.get('PubmedArticleSet').get('PubmedArticle').get(
                        'MedlineCitation').get('Article').get('ArticleTitle')

                    logging.info(f'({papers_count}/{total_papers}) Fetching PubMed paper: {paper_title}')

                    publication = _get_publication(paper_entry)
                    paper = _get_paper(paper_entry, publication)

                    if paper is not None:
                        paper.add_database(DATABASE_LABEL)
                        search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken entry must not abort the whole fetching
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            # presumably the second argument is the 0-based offset of the
            # next page - confirm against _get_api_result
            result = _get_api_result(search, papers_count)
Esempio n. 5
0
def run(search: Search):
    """
    Fetch papers from the ACM database using the provided search parameters.
    The collected papers are added to the provided search instance.

    Result pages are requested one after another until every reported paper
    was visited, the search reaches its limit for this database, or a result
    page has no paper links.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it
    """

    papers_count = 0
    result = _get_result(search)

    try:
        hits_text = result.xpath('//*[@class="hitsLength"]')[0].text.strip()
        # thousands separators (e.g. "1,234") would make int() fail and the
        # whole search silently be handled as empty, so they are dropped
        total_papers = int(hits_text.replace(',', ''))
    except Exception:  # pragma: no cover
        # hits counter missing or not numeric: nothing to fetch
        total_papers = 0

    logging.info(f'ACM: {total_papers} papers to fetch')

    page_index = 0
    while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):

        papers_urls = [BASE_URL+x.attrib['href']
                       for x in result.xpath('//*[@class="hlFld-Title"]/a')]

        if not papers_urls:
            # a page without paper links would keep the counter untouched
            # and the following pages would be requested forever
            break

        for paper_url in papers_urls:

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            try:
                papers_count += 1

                paper_page = _get_paper_page(paper_url)

                paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text

                logging.info(f'({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}')

                # the DOI is whatever follows the first matching URL marker,
                # '/doi/' being the fallback when none of the others is present
                doi_marker = next(
                    (marker for marker in ('/abs/', '/book/') if marker in paper_url), '/doi/')
                paper_doi = paper_url.split(doi_marker)[1]

                paper = _get_paper(paper_page, paper_doi, paper_url)

                if paper is None:
                    continue

                paper.add_database(DATABASE_LABEL)

                search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken page must not abort the whole fetching
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL):
            page_index += 1
            result = _get_result(search, page_index)
Esempio n. 6
0
def run(search: Search, api_token: str, url: Optional[str] = None, papers_count: Optional[int] = 0):
    """
    Fetch papers from the Scopus database using the provided search parameters.
    The collected papers are added to the provided search instance.

    Paginated responses are followed iteratively through their "next" link, so
    the number of pages is not bounded by the interpreter recursion limit.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it
    api_token : str
        The API key used to fetch data from Scopus database
    url : Optional[str]
        A predefined URL to be used for the search execution,
        this is usually used to resume the fetching from a given result page
    papers_count : Optional[int]
        Number of papers already visited, used when resuming from a given result page

    Raises
    ------
    AttributeError
        - The API token cannot be null
    """

    if api_token is None or len(api_token.strip()) == 0:
        raise AttributeError('The API token cannot be null')

    # the parameter is declared as Optional, so None is handled as zero
    papers_count = papers_count or 0

    while True:

        search_results = _get_search_results(search, api_token, url)

        total_papers = int(search_results.get('opensearch:totalResults', 0))

        logging.info(f'Scopus: {total_papers} papers to fetch')

        for paper_entry in search_results.get('entry', []):

            if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL):
                break

            # the entry counts as visited even when its conversion fails below
            papers_count += 1

            try:

                paper_title = paper_entry.get("dc:title")
                logging.info(f'({papers_count}/{total_papers}) Fetching Scopus paper: {paper_title}')

                publication = _get_publication(paper_entry, api_token)
                paper = _get_paper(paper_entry, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken entry must not abort the whole fetching
                logging.debug(e, exc_info=True)

        # first link flagged as "next", if any; a response without links
        # (or with a link lacking its href) is handled as the last page
        next_url = next(
            (link.get('@href') for link in search_results.get('link', []) if link.get('@ref') == 'next'),
            None)

        # If there is a next url, the API provided response was paginated and we need to process the next url
        if papers_count >= total_papers or next_url is None or search.reached_its_limit(DATABASE_LABEL):
            break

        url = next_url
Esempio n. 7
0
def run(search: Search):
    """
    Fetch papers from the arXiv database using the provided search parameters.
    The collected papers are added to the provided search instance.

    Results are requested page by page, with a one second pause between
    requests, until every reported entry was visited, the search reaches its
    limit for this database, or the API answers with a page without entries.
    The "since"/"until" date constraints of the search are applied here.

    Parameters
    ----------
    search : Search
        A search instance, the collected papers are added to it

    """

    papers_count = 0
    result = _get_api_result(search)

    total_papers = int(
        result.get('feed').get('opensearch:totalResults').get('#text'))

    logging.info(f'arXiv: {total_papers} papers to fetch')

    while (papers_count < total_papers
           and not search.reached_its_limit(DATABASE_LABEL)):

        entries = result.get('feed', {}).get('entry') or []
        if not isinstance(entries, list):
            # if there's only one entry the result is not a list just a dict
            entries = [entries]

        if not entries:
            # fewer entries delivered than announced: stop here, otherwise
            # the very same offset would be requested over and over again
            break

        for paper_entry in entries:

            if papers_count >= total_papers or search.reached_its_limit(
                    DATABASE_LABEL):
                break

            # the entry counts as visited even when skipped or failing below
            papers_count += 1

            try:

                paper_title = paper_entry.get("title")
                logging.info(
                    f'({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}'
                )

                # only the first 10 characters (YYYY-MM-DD) of the value are parsed
                published_date = datetime.datetime.strptime(
                    paper_entry.get('published')[:10], '%Y-%m-%d').date()

                # nowadays we don't have a date filter on arXiv API, so we need to do it by ourselves
                if search.since is not None and published_date < search.since:
                    logging.info(
                        'Skipping paper due to "since" date constraint')
                    continue
                elif search.until is not None and published_date > search.until:
                    logging.info(
                        'Skipping paper due to "until" date constraint')
                    continue

                publication = _get_publication(paper_entry)
                paper = _get_paper(paper_entry, published_date, publication)

                if paper is not None:
                    paper.add_database(DATABASE_LABEL)
                    search.add_paper(paper)

            except Exception as e:  # pragma: no cover
                # best effort: a broken entry must not abort the whole fetching
                logging.debug(e, exc_info=True)

        if papers_count < total_papers and not search.reached_its_limit(
                DATABASE_LABEL):
            time.sleep(1)  # sleep for 1 second to avoid server blocking
            result = _get_api_result(search, papers_count)