Code Example #1
File: test_core_urls.py  Project: njr175/ERP_SCANR
def test_build_fetch():
    """Test the build_fetch() method from URLS()."""

    urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')

    urls.build_fetch(['db', 'retmode'])

    assert urls.fetch
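
For context, a minimal sketch of how a fetch URL built this way might be used. This is not from the project: the exact URL layout is an assumption based on the standard NCBI E-utilities efetch endpoint (the test above only confirms that `urls.fetch` exists, and Example #2 below confirms it is a string that takes appended parameters), and the PubMed ID is hypothetical.

import requests

# URLS imported from the ERP_SCANR codebase, as in the test above
urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')
urls.build_fetch(['db', 'retmode'])

# Assumption: urls.fetch is a base efetch URL such as
#   https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml
# Appending '&id=<pubmed_id>' would then fetch a specific article:
art_page = requests.get(urls.fetch + '&id=' + '28000000')  # hypothetical ID
print(art_page.status_code)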
Code Example #2
File: words.py  Project: njr175/ERP_SCANR
    def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
        """Search through pubmed for all abstracts referring to a given ERP.

        The scraping does an exact word search for the ERP term given.
        It then loops through all the artciles found about that data.
        For each article, pulls title, year and word data.

        Notes
        -----
        - Pulls data using the hierarchical tag structure that organize the articles.
        - Initially, the procedure was to pull all tags of a certain type.
            For example: extract all 'DateCreated' tags.
            This procedure fails (or badly organizes data) when an articles is
                missing a particular tag.
            Now: take advantage of the hierarchy, loop through each article tag.
                From here, pull out the data, if available.
                This way, can deal with cases of missing data.
        """

        # Set date of when data was collected
        self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

        # Get e-utils URLS object
        hist_val = 'y' if use_hist else 'n'
        urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                    retmode='xml', field='TIAB', auto_gen=False)
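        # Going by the method names, the build_* calls below construct the
        # E-utilities einfo, esearch, and efetch request URLs, with each call
        # baking the listed settings into the URL as query parameters.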
        urls.build_info(['db'])
        urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
        urls.build_fetch(['db', 'retmode'])

        # Get current information about database being used
        self.get_db_info(urls.info)

        # Loop through all the ERPs
        for ind, lab in enumerate(self.labels):

            # Print out status
            print('Scraping words for: ', lab)

            # Initialize object to store data for current erp papers
            cur_erp = ERPData(lab, self.erps[ind])

            # Set up search terms - add exclusions, if there are any
            if self.exclusions[ind][0]:
                term_arg = comb_terms(self.erps[ind], 'or') + comb_terms(self.exclusions[ind], 'not')
            else:
                term_arg = comb_terms(self.erps[ind], 'or')

            # Create the URL for the ERP search term
            url = urls.search + term_arg

            # Get page and parse
            page = self.req.get_url(url)
            page_soup = BeautifulSoup(page.content, 'lxml')

            # Using history
            if use_hist:

                # Initialize indices for fetching articles in batches
                ret_start = 0
                ret_max = 100

                # Pull out the article count and the history-server references
                count = int(page_soup.find('count').text)
                web_env = page_soup.find('webenv').text
                query_key = page_soup.find('querykey').text
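                # Note: 'webenv' and 'querykey' come from NCBI's history server,
                # which stores the search results server-side. They identify the
                # stored result set, so each batch below can be fetched directly
                # without re-running the search.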

                # Update History
                cur_erp.update_history('Start Scrape')

                # Fetch and process articles in batches, until all have been collected
                while ret_start < count:

                    # Build the fetch URL for the current batch of articles
                    art_url = urls.fetch + '&WebEnv=' + web_env + '&query_key=' + query_key + \
                              '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                    art_page = self.req.get_url(art_url)
                    art_page_soup = BeautifulSoup(art_page.content, "xml")

                    # Pull out articles
                    articles = art_page_soup.find_all('PubmedArticle')

                    # Loop through each article, extracting relevant information
                    for art in articles:

                        # Get ID of current article
                        new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

                        # Extract and add all relevant info from the current article to the ERPData object
                        cur_erp = self.extract_add_info(cur_erp, new_id, art)

                    # Advance the index to the next batch of results
                    ret_start += ret_max

            # Without using history
            else:

                # Get all ids
                ids = page_soup.find_all('id')

                # Convert ids to string
                ids_str = _ids_to_str(ids)

                # Get article page
                art_url = urls.fetch + '&id=' + ids_str
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.find_all('PubmedArticle')

                # Update History
                cur_erp.update_history('Start Scrape')

                # Loop through each article, extracting relevant information
                for art_ind, art in enumerate(articles):

                    # Get ID of current article
                    new_id = int(ids[art_ind].text)

                    # Extract and add all relevant info from the current article to the ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

            # Check consistency of extracted results
            cur_erp.check_results()
            cur_erp.update_history('End Scrape')

            # Save out and clear data
            cur_erp.save_n_clear()

            # Add the object with current erp data to results list
            self.add_results(cur_erp)

        # Mark the Requester object as finished being used
        self.req.close()
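
The Notes in the docstring above describe the key extraction choice: instead of collecting all tags of one type across the whole results page, loop over each <PubmedArticle> node and pull tags out per article, so a missing tag affects only that one article rather than misaligning the whole dataset. Here is a minimal standalone sketch of that pattern (the helper is hypothetical, not the project's own extract() function):

from bs4 import BeautifulSoup

def pull_tag(article, tag_name):
    """Return the text of a tag within one article, or None if it is missing."""
    found = article.find(tag_name)
    return found.text if found is not None else None

xml = """
<PubmedArticleSet>
  <PubmedArticle>
    <ArticleTitle>An ERP study</ArticleTitle>
    <DateCreated><Year>2016</Year></DateCreated>
  </PubmedArticle>
  <PubmedArticle>
    <ArticleTitle>Another ERP study</ArticleTitle>
  </PubmedArticle>
</PubmedArticleSet>
"""

soup = BeautifulSoup(xml, 'xml')
for art in soup.find_all('PubmedArticle'):
    # A tag missing from this article yields None, instead of shifting
    # values onto the wrong article.
    print(pull_tag(art, 'ArticleTitle'), pull_tag(art, 'Year'))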