Exemple #1
0
def extract_add_info(arts, art_id, art):
    """Pull the fields of interest out of one article and store them.

    Parameters
    ----------
    arts : Articles
        Container that collects the data for the current article.
    art_id : int
        Identifier to record for the article.
    art : bs4.element.Tag
        Tag holding the article to extract from.

    Returns
    -------
    arts : Articles
        The same container, after the article's data has been added.
    """

    # Each value is extracted and stored immediately, one field at a time
    arts.add_data('ids', art_id)

    title = extract(art, 'ArticleTitle', 'str')
    arts.add_data('titles', title)

    authors = process_authors(extract(art, 'AuthorList', 'raw'))
    arts.add_data('authors', authors)

    # Journals are stored as a (full title, ISO abbreviation) pair
    journal = (extract(art, 'Title', 'str'),
               extract(art, 'ISOAbbreviation', 'str'))
    arts.add_data('journals', journal)

    words = extract(art, 'AbstractText', 'all-str')
    arts.add_data('words', words)

    keywords = extract(art, 'Keyword', 'all-list')
    arts.add_data('keywords', keywords)

    year = process_pub_date(extract(art, 'PubDate', 'raw'))
    arts.add_data('years', year)

    doi = process_ids(extract(art, 'ArticleId', 'all'), 'doi')
    arts.add_data('dois', doi)

    return arts
Exemple #2
0
def get_db_info(req, info_url):
    """Query EInfo for the details and status of the database in use.

    Parameters
    ----------
    req : Requester
        Object used to issue the request.
    info_url : str
        URL from which the database information is requested.

    Returns
    -------
    db_info : dict
        Mapping from each EInfo field name to the value extracted for it.
    """

    # Names of the EInfo fields that get collected
    wanted = ('dbname', 'menuname', 'description',
              'dbbuild', 'count', 'lastupdate')

    # Fetch the info page and hand its content to BeautifulSoup
    response = req.request_url(info_url)
    soup = BeautifulSoup(response.content, 'lxml')

    # One entry per field, in the order listed above
    return {name: extract(soup, name, 'str') for name in wanted}
Exemple #3
0
def get_count(req, url):
    """Report how many articles are listed at the requested URL.

    Parameters
    ----------
    req : Requester
        Object used to issue the request.
    url : str
        URL from which the count data is requested.

    Returns
    -------
    count : int
        Number of articles found, or 0 if no count entry is present.
    """

    response = req.request_url(url)
    soup = BeautifulSoup(response.content, 'lxml')
    found = extract(soup, 'count', 'all')

    # Only the first count entry is used; no entries at all means zero
    try:
        return int(found[0].text)
    except IndexError:
        return 0
Exemple #4
0
def get_articles(req, art_url, arts):
    """Collect information for each article found for a given term.

    Parameters
    ----------
    req : Requester
        Requester object to launch requests from.
    art_url : str
        URL of the page of articles to be collected.
    arts : Articles
        Object to add data to.

    Returns
    -------
    arts : Articles
        Object updated with the information of every article on the page.
    """

    # Get page of all articles, parsed as XML
    art_page = req.request_url(art_url)
    art_page_soup = BeautifulSoup(art_page.content, 'xml')

    # `find_all` is the supported method name; `findAll` is a deprecated alias
    articles = art_page_soup.find_all('PubmedArticle')

    # Loop through each article, extracting relevant information
    for art in articles:

        # Get ID of current article & extract and add info to data object
        new_id = process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')
        arts = extract_add_info(arts, new_id, art)

    return arts
Exemple #5
0
def get_db_info(req, info_url):
    """Query EInfo for the details and status of the database in use.

    Parameters
    ----------
    req : Requester
        Object used to issue the request.
    info_url : str
        URL from which the database information is requested.

    Returns
    -------
    db_info : dict
        Mapping from each EInfo field name to the value extracted for it.

    Examples
    --------
    Get info on the pubmed database:

    >>> from lisc.requester import Requester
    >>> url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=pubmed'
    >>> db_info = get_db_info(Requester(), url)
    """

    # Names of the EInfo fields that get collected
    wanted = ('dbname', 'menuname', 'description',
              'dbbuild', 'count', 'lastupdate')

    # Fetch the info page and hand its content to BeautifulSoup
    response = req.request_url(info_url)
    soup = BeautifulSoup(response.content, 'lxml')

    # One entry per field, in the order listed above
    return {name: extract(soup, name, 'str') for name in wanted}