Example no. 1
def _process_authors(author_list):
    """

    Parameters
    ----------
    author_list : bs4.element.Tag
        AuthorList tag, which contains tags related to author data.

    Returns
    -------
    out : list of tuple of (str, str, str, str)
        List of authors, each as (LastName, FirstName, Initials, Affiliation).
    """

    # Pull out all author tags from the input
    authors = extract(author_list, 'Author', 'all')

    # Initialize list to return
    out = []

    # Extract data for each author
    for author in authors:
        out.append((extract(author, 'LastName',
                            'str'), extract(author, 'ForeName', 'str'),
                    extract(author, 'Initials',
                            'str'), extract(author, 'Affiliation', 'str')))

    return out
Example no. 2
def _process_authors(author_list):
    """

    Parameters
    ----------
    author_list : bs4.element.Tag
        AuthorList tag, which contains tags related to author data.

    Returns
    -------
    out : list of tuple of (str, str, str, str)
        List of authors, each as (LastName, FirstName, Initials, Affiliation).
    """

    # Pull out all author tags from the input
    authors = extract(author_list, 'Author', 'all')

    # Initialize list to return
    out = []

    # Extract data for each author
    for author in authors:
        out.append((extract(author, 'LastName', 'str'), extract(author, 'ForeName', 'str'),
                    extract(author, 'Initials', 'str'), extract(author, 'Affiliation', 'str')))

    return out
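The two variants above differ only in how the tuple construction is wrapped. A hypothetical usage sketch follows, assuming _process_authors and the extract helper it calls are already in scope; the XML fragment is made up for illustration, not taken from a real record.

# Illustrative usage sketch: feed a PubMed-style AuthorList fragment to
# _process_authors. Assumes _process_authors and extract are in scope.
from bs4 import BeautifulSoup

author_xml = """
<AuthorList>
  <Author>
    <LastName>Doe</LastName>
    <ForeName>Jane</ForeName>
    <Initials>JD</Initials>
    <Affiliation>Example University</Affiliation>
  </Author>
</AuthorList>
"""

# Parse the fragment and pull out the AuthorList tag
author_list = BeautifulSoup(author_xml, 'xml').find('AuthorList')

# Expected output: [('Doe', 'Jane', 'JD', 'Example University')]
print(_process_authors(author_list))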
Example no. 3
    def extract_add_info(cur_erp, new_id, art):
        """Extract information from article web page and add to

        Parameters
        ----------
        cur_erp : ERPData() object
            Object to store information for the current ERP term.
        new_id : int
            Paper ID of the new paper.
        art : bs4.element.Tag() object
            Extracted pubmed article.

        Notes
        -----
        - Data extraction is wrapped in try/except statements in order to
        handle fields that may be missing from a given article.
        """

        # Add ID of current article
        cur_erp.add_id(new_id)
        cur_erp.add_title(extract(art, 'ArticleTitle', 'str'))
        cur_erp.add_authors(_process_authors(extract(art, 'AuthorList', 'raw')))
        cur_erp.add_journal(extract(art, 'Title', 'str'), extract(art, 'ISOAbbreviation', 'str'))
        cur_erp.add_words(_process_words(extract(art, 'AbstractText', 'str')))
        cur_erp.add_kws(_process_kws(extract(art, 'Keyword', 'all')))
        cur_erp.add_pub_date(_process_pub_date(extract(art, 'PubDate', 'raw')))
        cur_erp.add_doi(_process_ids(extract(art, 'ArticleId', 'all'), 'doi'))

        # Increment number of articles included in ERPData
        cur_erp.increment_n_articles()

        return cur_erp
Example no. 4
    def extract_add_info(cur_erp, new_id, art):
        """Extract information from article web page and add to

        Parameters
        ----------
        cur_erp : ERPData() object
            Object to store information for the current ERP term.
        new_id : int
            Paper ID of the new paper.
        art : bs4.element.Tag() object
            Extracted pubmed article.

        Notes
        -----
        - Data extraction is wrapped in try/except statements in order to
        handle fields that may be missing from a given article.
        """

        # Add ID of current article
        cur_erp.add_id(new_id)
        cur_erp.add_title(extract(art, 'ArticleTitle', 'str'))
        cur_erp.add_authors(_process_authors(extract(art, 'AuthorList',
                                                     'raw')))
        cur_erp.add_journal(extract(art, 'Title', 'str'),
                            extract(art, 'ISOAbbreviation', 'str'))
        cur_erp.add_words(_process_words(extract(art, 'AbstractText', 'str')))
        cur_erp.add_kws(_process_kws(extract(art, 'Keyword', 'all')))
        cur_erp.add_pub_date(_process_pub_date(extract(art, 'PubDate', 'raw')))
        cur_erp.add_doi(_process_ids(extract(art, 'ArticleId', 'all'), 'doi'))

        # Increment number of articles included in ERPData
        cur_erp.increment_n_articles()

        return cur_erp
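The helpers called here (_process_words, _process_kws, _process_pub_date, _process_ids) are defined elsewhere. As one example, a minimal sketch of what _process_ids might look like is given below; this is an assumption based only on how it is called above (PubMed ArticleId tags carry an 'IdType' attribute such as 'pubmed' or 'doi'), not the project's actual implementation.

# Assumed sketch of _process_ids: return the text of the ArticleId tag whose
# IdType attribute matches the requested id type.
def _process_ids(ids, id_type):

    # Nothing to return if no ArticleId tags were found
    if not ids:
        return None

    # Check each ArticleId tag's IdType attribute (e.g. 'pubmed', 'doi')
    for id_tag in ids:
        if id_tag.get('IdType') == id_type:
            return id_tag.text

    # No id of the requested type was present
    return None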
Example no. 5
def test_extract():
    """Test the extract function."""

    # Create a complex tag
    out = bs4.element.Tag(name='Out')
    inn1 = bs4.element.Tag(name='Inn')
    inn2 = bs4.element.Tag(name='Inn')

    inn1.append('words words')
    inn2.append('more words')

    out.append(inn1)
    out.append(inn2)

    # Test error - bad how
    with raises(ValueError):
        out_err = extract(out, 'Inn', 'bad')

    # Test how = 'raw'
    out_raw = extract(out, 'Inn', 'raw')
    assert type(out_raw) is bs4.element.Tag

    # DROPPED CASE WITH MOVE TO PY35
    # Test how = 'txt'
    #out_txt = extract(out, 'Inn', 'txt')
    #assert isinstance(out_txt, UnicodeType)
    #assert out_txt == unicode('words words')

    # Test how = 'str'
    out_str = extract(out, 'Inn', 'str')
    # TODO: Figure this out - what is the return type?
    #assert isinstance(out_str, str)
    #assert out_str == 'words words'

    # Test how = 'all'
    out_all = extract(out, 'Inn', 'all')
    assert type(out_all) is bs4.element.ResultSet

    # Test with non-existent tag name
    out_none = extract(out, 'bad', 'raw')
    assert out_none is None
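The extract function exercised by this test is not shown in these examples. A minimal sketch of behavior that would satisfy the assertions above is given below; this is an inference from the test, not the project's actual implementation.

# Sketch of an extract helper consistent with test_extract: 'raw' returns the
# first matching tag, 'str' its text, 'all' every match as a ResultSet, an
# unknown mode raises ValueError, and a missing tag yields None.
def extract(tag, label, how):

    # Reject unsupported extraction modes up front
    if how not in ('raw', 'str', 'all'):
        raise ValueError('Unknown extraction mode: ' + how)

    # 'all' returns every matching sub-tag
    if how == 'all':
        return tag.find_all(label)

    # 'raw' and 'str' operate on the first matching sub-tag, if any
    found = tag.find(label)
    if found is None:
        return None

    return found if how == 'raw' else found.text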
Example no. 6
def _process_pub_date(pub_date):
    """

    Parameters
    ----------
    pub_date : bs4.element.Tag
        PubDate tag, which contains tags with publication date information.

    Returns
    -------
    year : int or None
        Publication year, if available.
    month : str or None
        Publication month, if available.
    """

    # Extract year, convert to int if not None
    year = extract(pub_date, 'Year', 'str')
    if year: year = int(year)

    # Extract month
    month = extract(pub_date, 'Month', 'str')

    return year, month
Example no. 7
def _process_pub_date(pub_date):
    """

    Parameters
    ----------
    pub_date : bs4.element.Tag
        PubDate tag, which contains tags with publication date information.

    Returns
    -------
    year : int or None
        Publication year, if available.
    month : str or None
        Publication month, if available.
    """

    # Extract year, convert to int if not None
    year = extract(pub_date, 'Year', 'str')
    if year: year = int(year)

    # Extract month
    month = extract(pub_date, 'Month', 'str')

    return year, month
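An illustrative usage sketch, assuming _process_pub_date and the extract helper it calls are in scope; the XML fragment is made up for the example.

# Parse a small PubDate fragment and process it
from bs4 import BeautifulSoup

pub_date = BeautifulSoup('<PubDate><Year>2016</Year><Month>Jan</Month></PubDate>',
                         'xml').find('PubDate')

# Expected output: (2016, 'Jan')
print(_process_pub_date(pub_date))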
Example no. 8
    def _get_count(self, url):
        """Get the count of how many articles listed on search results URL.

        Parameters
        ----------
        url : str
            URL to search with.

        Returns
        -------
        count : int
            Number of articles listed in the search results.
        """

        # Request page from URL
        page = self.req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Get all count tags
        counts = extract(page_soup, 'count', 'all')

        return int(counts[0].text)
Example no. 9
    def _get_count(self, url):
        """Get the count of how many articles listed on search results URL.

        Parameters
        ----------
        url : str
            URL to search with.

        Returns
        -------
        count : int
            Number of articles listed in the search results.
        """

        # Request page from URL
        page = self.req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Get all count tags
        counts = extract(page_soup, 'count', 'all')

        return int(counts[0].text)
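For context, an NCBI eSearch response typically reports the total number of hits in a Count tag near the top of the result, which is why only the first matching tag is used. A rough sketch of the parsing step, on that assumption; the response string below is illustrative, not a real eSearch reply.

# With the lxml parser, tag names are lower-cased, so 'count' matches <Count>
from bs4 import BeautifulSoup

response = '<eSearchResult><Count>2203</Count><RetMax>20</RetMax></eSearchResult>'
page_soup = BeautifulSoup(response, 'lxml')

counts = page_soup.find_all('count')
print(int(counts[0].text))   # -> 2203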
Example no. 10
    def get_db_info(self, info_url):
        """Calls EInfo to get info and status of db to be used for scraping.

        Parameters
        ----------
        info_url : str
            URL to request db information from.
        """

        # Get the info page and parse with BeautifulSoup
        info_page = self.req.get_url(info_url)
        info_page_soup = BeautifulSoup(info_page.content, 'lxml')

        # Set list of fields to extract from eInfo
        fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

        # Extract basic information into a dictionary
        for field in fields:
            self.db_info[field] = extract(info_page_soup, field, 'str')
Example no. 11
    def get_db_info(self, info_url):
        """Calls EInfo to get info and status of db to be used for scraping.

        Parameters
        ----------
        info_url : str
            URL to request db information from.
        """

        # Get the info page and parse with BeautifulSoup
        info_page = self.req.get_url(info_url)
        info_page_soup = BeautifulSoup(info_page.content, 'lxml')

        # Set list of fields to extract from eInfo
        fields = [
            'dbname', 'menuname', 'description', 'dbbuild', 'count',
            'lastupdate'
        ]

        # Extract basic information into a dictionary
        for field in fields:
            self.db_info[field] = extract(info_page_soup, field, 'str')
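After the loop, self.db_info holds one string per requested EInfo field. Roughly, it would look like the sketch below; the values are invented for illustration, not real EInfo output.

# Illustrative shape of self.db_info after get_db_info runs (values made up)
db_info = {
    'dbname': 'pubmed',
    'menuname': 'PubMed',
    'description': 'PubMed bibliographic record',
    'dbbuild': 'Build-2016.01.01',
    'count': '26000000',
    'lastupdate': '2016/01/01 00:00',
}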
Example no. 12
    def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
        """Search through pubmed for all abstracts referring to a given ERP.

        The scraping does an exact word search for the ERP term given.
        It then loops through all the artciles found about that data.
        For each article, pulls title, year and word data.

        Notes
        -----
        - Data is pulled using the hierarchical tag structure that organizes the articles.
        - Initially, the procedure was to pull all tags of a certain type.
            For example: extract all 'DateCreated' tags.
            This procedure fails (or badly organizes data) when an article is
                missing a particular tag.
            Now: take advantage of the hierarchy and loop through each article tag.
                From there, pull out the data, if available.
                This way, cases of missing data can be handled.
        """

        # Set date of when data was collected
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Get e-utils URLS object
        if use_hist: hist_val = 'y'
        else: hist_val = 'n'
        urls = URLS(db=db, usehistory=hist_val, retmax=retmax, retmode='xml', field='TIAB', auto_gen=False)
        urls.build_info(['db'])
        urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
        urls.build_fetch(['db', 'retmode'])

        # Get current information about database being used
        self.get_db_info(urls.info)

        # Loop through all the erps
        #for ind, erp in enumerate(self.erps):
        for ind, lab in enumerate(self.labels):

            # Print out status
            print('Scraping words for: ', lab)

            # Initialize object to store data for the current ERP's papers
            cur_erp = ERPData(lab, self.erps[ind])

            # Set up search terms - add exclusions, if there are any
            if self.exclusions[ind][0]:
                #term_arg = '"' + erp[0] + '"' + 'NOT' + '"' + self.exclusions[ind][0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or') + comb_terms(self.exclusions[ind], 'not')
            else:
                #term_arg = '"' + erp[0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or')

            # Create the url for the erp search term
            url = urls.search + term_arg

            # Get page and parse
            page = self.req.get_url(url)
            page_soup = BeautifulSoup(page.content, 'lxml')

            # Using history
            if use_hist:

                # Settings for batched retrieval of articles
                ret_start = 0
                ret_max = 100

                # Get the total article count and history server keys for this search
                count = int(page_soup.find('count').text)
                web_env = page_soup.find('webenv').text
                query_key = page_soup.find('querykey').text

                # Update History
                cur_erp.update_history('Start Scrape')

                # Fetch and process the articles in batches
                while ret_start < count:

                    # Build the fetch URL for the current batch of articles
                    art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                              '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                    art_page = self.req.get_url(art_url)
                    art_page_soup = BeautifulSoup(art_page.content, "xml")

                    # Pull out articles
                    articles = art_page_soup.findAll('PubmedArticle')

                    # Loop through each article, extracting relevant information
                    for ind, art in enumerate(articles):

                        # Get ID of current article
                        new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')
                        #new_id = int(ids[ind].text)

                        # Extract and add all relevant info from the current article to the ERPData object
                        cur_erp = self.extract_add_info(cur_erp, new_id, art)

                    # Advance to the next batch of articles
                    ret_start += ret_max

            # Without using history
            else:

                # Get all ids
                ids = page_soup.find_all('id')

                # Convert ids to string
                ids_str = _ids_to_str(ids)

                # Get article page
                art_url = urls.fetch + '&id=' + ids_str
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Update History
                cur_erp.update_history('Start Scrape')

                # Loop through each article, extracting relevant information
                for ind, art in enumerate(articles):

                    # Get ID of current article
                    new_id = int(ids[ind].text)

                    # Extract and add all relevant info from the current article to the ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

            # Check consistency of extracted results
            cur_erp.check_results()
            cur_erp.update_history('End Scrape')

            # Save out and clear data
            cur_erp.save_n_clear()

            # Add the object with current erp data to results list
            self.add_results(cur_erp)

        # Set Requester object as finished being used
        self.req.close()
Example no. 13
    def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
        """Search through pubmed for all abstracts referring to a given ERP.

        The scraping does an exact word search for the ERP term given.
        It then loops through all the artciles found about that data.
        For each article, pulls title, year and word data.

        Notes
        -----
        - Data is pulled using the hierarchical tag structure that organizes the articles.
        - Initially, the procedure was to pull all tags of a certain type.
            For example: extract all 'DateCreated' tags.
            This procedure fails (or badly organizes data) when an article is
                missing a particular tag.
            Now: take advantage of the hierarchy and loop through each article tag.
                From there, pull out the data, if available.
                This way, cases of missing data can be handled.
        """

        # Set date of when data was collected
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Get e-utils URLS object
        if use_hist: hist_val = 'y'
        else: hist_val = 'n'
        urls = URLS(db=db,
                    usehistory=hist_val,
                    retmax=retmax,
                    retmode='xml',
                    field='TIAB',
                    auto_gen=False)
        urls.build_info(['db'])
        urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
        urls.build_fetch(['db', 'retmode'])

        # Get current information about database being used
        self.get_db_info(urls.info)

        # Loop through all the erps
        #for ind, erp in enumerate(self.erps):
        for ind, lab in enumerate(self.labels):

            # Print out status
            print('Scraping words for: ', lab)

            # Initialize object to store data for the current ERP's papers
            cur_erp = ERPData(lab, self.erps[ind])

            # Set up search terms - add exclusions, if there are any
            if self.exclusions[ind][0]:
                #term_arg = '"' + erp[0] + '"' + 'NOT' + '"' + self.exclusions[ind][0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or') + comb_terms(
                    self.exclusions[ind], 'not')
            else:
                #term_arg = '"' + erp[0] + '"'
                term_arg = comb_terms(self.erps[ind], 'or')

            # Create the url for the erp search term
            url = urls.search + term_arg

            # Get page and parse
            page = self.req.get_url(url)
            page_soup = BeautifulSoup(page.content, 'lxml')

            # Using history
            if use_hist:

                # Settings for batched retrieval of articles
                ret_start = 0
                ret_max = 100

                # Get the total article count and history server keys for this search
                count = int(page_soup.find('count').text)
                web_env = page_soup.find('webenv').text
                query_key = page_soup.find('querykey').text

                # Update History
                cur_erp.update_history('Start Scrape')

                # Fetch and process the articles in batches
                while ret_start < count:

                    # Build the fetch URL for the current batch of articles
                    art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                              '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                    art_page = self.req.get_url(art_url)
                    art_page_soup = BeautifulSoup(art_page.content, "xml")

                    # Pull out articles
                    articles = art_page_soup.findAll('PubmedArticle')

                    # Loop through each article, extracting relevant information
                    for ind, art in enumerate(articles):

                        # Get ID of current article
                        new_id = _process_ids(extract(art, 'ArticleId', 'all'),
                                              'pubmed')
                        #new_id = int(ids[ind].text)

                        # Extract and add all relevant info from the current article to the ERPData object
                        cur_erp = self.extract_add_info(cur_erp, new_id, art)

                    # Advance to the next batch of articles
                    ret_start += ret_max

            # Without using history
            else:

                # Get all ids
                ids = page_soup.find_all('id')

                # Convert ids to string
                ids_str = _ids_to_str(ids)

                # Get article page
                art_url = urls.fetch + '&id=' + ids_str
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Update History
                cur_erp.update_history('Start Scrape')

                # Loop through each article, extracting relevant information
                for ind, art in enumerate(articles):

                    # Get ID of current article
                    new_id = int(ids[ind].text)

                    # Extract and add all relevant info from the current article to the ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

            # Check consistency of extracted results
            cur_erp.check_results()
            cur_erp.update_history('End Scrape')

            # Save out and clear data
            cur_erp.save_n_clear()

            # Add the object with current erp data to results list
            self.add_results(cur_erp)

        # Set Requester object as finished being used
        self.req.close()
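scrape_data also relies on helpers that are not shown here, such as _ids_to_str, which builds the comma-separated id list appended to the EFetch URL in the non-history branch. A minimal sketch of what such a helper might look like follows; this is an assumption based on how it is called above, not the project's actual code.

# Assumed sketch: join the text of each <id> tag into the comma-separated
# string that the EFetch '&id=' parameter expects.
def _ids_to_str(ids):

    # e.g. tags with texts '123456' and '234567' -> '123456,234567'
    return ','.join(id_tag.text for id_tag in ids)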