def test_build_fetch():
    """Test the build_fetch() method from URLS()."""

    urls = URLS(db='pubmed', retmax='500', field='id', retmode='xml')
    urls.build_fetch(['db', 'retmode'])

    assert urls.fetch
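
# A minimal, standalone sketch of the URL assembly that build_fetch() is assumed
# to perform: join the efetch base URL with only the selected settings. The URLS
# class itself is not reproduced here; 'build_fetch_url' and 'EFETCH_BASE' are
# hypothetical names used only for illustration (the efetch endpoint itself is
# the documented NCBI E-utilities URL).

EFETCH_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def build_fetch_url(settings, use):
    """Build an efetch URL from a dict of settings, keeping only the keys in `use`."""

    args = '&'.join(key + '=' + settings[key] for key in use if settings.get(key))
    return EFETCH_BASE + '?' + args

# Example, mirroring the settings used in the test above:
#   build_fetch_url({'db': 'pubmed', 'retmax': '500', 'field': 'id', 'retmode': 'xml'},
#                   ['db', 'retmode'])
#   -> 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml'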
def scrape_data(self, db=None, retmax=None, use_hist=False, verbose=False):
    """Search through pubmed for all abstracts referring to a given ERP.

    The scraping does an exact word search for the ERP term given.
    It then loops through all the articles found for that term.
    For each article, it pulls the title, year and word data.

    Notes
    -----
    - Pulls data using the hierarchical tag structure that organizes the articles.
    - Initially, the procedure was to pull all tags of a certain type.
      For example: extract all 'DateCreated' tags. This procedure fails
      (or badly organizes data) when an article is missing a particular tag.
      Now: take advantage of the hierarchy, looping through each article tag.
      From there, pull out the data, if available. This way, cases of
      missing data can be handled.
    """

    # Set date of when data was collected
    self.date = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

    # Get e-utils URLS object
    if use_hist:
        hist_val = 'y'
    else:
        hist_val = 'n'
    urls = URLS(db=db, usehistory=hist_val, retmax=retmax,
                retmode='xml', field='TIAB', auto_gen=False)
    urls.build_info(['db'])
    urls.build_search(['db', 'usehistory', 'retmax', 'retmode', 'field'])
    urls.build_fetch(['db', 'retmode'])

    # Get current information about database being used
    self.get_db_info(urls.info)

    # Loop through all the erps
    for ind, lab in enumerate(self.labels):

        # Print out status
        print('Scraping words for: ', lab)

        # Initialize object to store data for current erp papers
        cur_erp = ERPData(lab, self.erps[ind])

        # Set up search terms - add exclusions, if there are any
        if self.exclusions[ind][0]:
            term_arg = comb_terms(self.erps[ind], 'or') + \
                comb_terms(self.exclusions[ind], 'not')
        else:
            term_arg = comb_terms(self.erps[ind], 'or')

        # Create the url for the erp search term
        url = urls.search + term_arg

        # Get page and parse
        page = self.req.get_url(url)
        page_soup = BeautifulSoup(page.content, 'lxml')

        # Using history
        if use_hist:

            # Initialize batch settings
            ret_start = 0
            ret_max = 100

            # Get the result count and the history tokens from the search page
            count = int(page_soup.find('count').text)
            web_env = page_soup.find('webenv').text
            query_key = page_soup.find('querykey').text

            # Update History
            cur_erp.update_history('Start Scrape')

            # Fetch articles in batches until all results have been collected
            while ret_start < count:

                # Build the fetch URL for the current batch of results
                art_url = urls.fetch + '&WebEnv=' + web_env + \
                    '&query_key=' + query_key + \
                    '&retstart=' + str(ret_start) + '&retmax=' + str(ret_max)
                art_page = self.req.get_url(art_url)
                art_page_soup = BeautifulSoup(art_page.content, "xml")

                # Pull out articles
                articles = art_page_soup.findAll('PubmedArticle')

                # Loop through each article, extracting relevant information
                for art in articles:

                    # Get ID of current article
                    new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

                    # Extract and add all relevant info from current article to ERPData object
                    cur_erp = self.extract_add_info(cur_erp, new_id, art)

                # Advance to the next batch of results
                ret_start += ret_max

        # Without using history
        else:

            # Get all ids
            ids = page_soup.find_all('id')

            # Convert ids to string
            ids_str = _ids_to_str(ids)

            # Get article page
            art_url = urls.fetch + '&id=' + ids_str
            art_page = self.req.get_url(art_url)
            art_page_soup = BeautifulSoup(art_page.content, "xml")

            # Pull out articles
            articles = art_page_soup.findAll('PubmedArticle')

            # Update History
            cur_erp.update_history('Start Scrape')

            # Loop through each article, extracting relevant information
            for art_ind, art in enumerate(articles):

                # Get ID of current article
                new_id = int(ids[art_ind].text)

                # Extract and add all relevant info from current article to ERPData object
                cur_erp = self.extract_add_info(cur_erp, new_id, art)

        # Check consistency of extracted results
        cur_erp.check_results()
        cur_erp.update_history('End Scrape')

        # Save out and clear data
        cur_erp.save_n_clear()

        # Add the object with current erp data to results list
        self.add_results(cur_erp)

    # Set Requester object as finished being used
    self.req.close()
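
# A minimal, self-contained sketch of the E-utilities history workflow that the
# use_hist branch above relies on, written against the public esearch/efetch
# endpoints directly instead of the project's URLS and Requester objects. The
# search term '"N400"' and the batch size are illustrative only; WebEnv,
# query_key, retstart and retmax are the documented E-utilities parameters.

import requests
from bs4 import BeautifulSoup

EUTILS_BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

# Step 1: esearch with usehistory=y stores the result set on the NCBI server
# and returns a WebEnv token and QueryKey that later requests can refer to.
search = requests.get(EUTILS_BASE + 'esearch.fcgi',
                      params={'db': 'pubmed', 'term': '"N400"', 'field': 'TIAB',
                              'usehistory': 'y', 'retmode': 'xml'})
search_soup = BeautifulSoup(search.content, 'lxml')
count = int(search_soup.find('count').text)
web_env = search_soup.find('webenv').text
query_key = search_soup.find('querykey').text

# Step 2: efetch the stored results in batches of ret_max, advancing retstart
# after each batch until the full result count has been covered.
ret_start, ret_max = 0, 100
while ret_start < count:
    fetch = requests.get(EUTILS_BASE + 'efetch.fcgi',
                         params={'db': 'pubmed', 'retmode': 'xml',
                                 'WebEnv': web_env, 'query_key': query_key,
                                 'retstart': ret_start, 'retmax': ret_max})
    articles = BeautifulSoup(fetch.content, 'xml').find_all('PubmedArticle')
    # ... extract title / year / word data from each article here ...
    ret_start += ret_max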