def _process_authors(author_list):
    """Extract and process author data.

    Parameters
    ----------
    author_list : bs4.element.Tag
        AuthorList tag, which contains tags related to author data.

    Returns
    -------
    out : list of tuple of (str, str, str, str)
        List of authors, each as (LastName, FirstName, Initials, Affiliation).
    """

    # Pull out all author tags from the input
    authors = extract(author_list, 'Author', 'all')

    # Initialize list to return
    out = []

    # Extract data for each author
    for author in authors:
        out.append((extract(author, 'LastName', 'str'),
                    extract(author, 'ForeName', 'str'),
                    extract(author, 'Initials', 'str'),
                    extract(author, 'Affiliation', 'str')))

    return out
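# A hedged usage sketch, not part of the original module: builds the kind of
# AuthorList markup that _process_authors expects and runs it through the
# function. The sample XML, the expected output, and the helper name
# _demo_process_authors are made up for illustration, and assume the module's
# extract helper behaves as documented ('all' -> find_all, 'str' -> tag text).
def _demo_process_authors():

    from bs4 import BeautifulSoup

    xml = ("<AuthorList><Author>"
           "<LastName>Doe</LastName><ForeName>Jane</ForeName>"
           "<Initials>JD</Initials><Affiliation>Some University</Affiliation>"
           "</Author></AuthorList>")
    author_list = BeautifulSoup(xml, 'xml').find('AuthorList')

    # Expected, under the assumptions above: [('Doe', 'Jane', 'JD', 'Some University')]
    return _process_authors(author_list)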
def _get_count(req, url):
    """Get the count of how many articles are listed on a search results URL.

    Parameters
    ----------
    req : Requester() object
        Manages requests.
    url : str
        URL to search with.

    Returns
    -------
    count : int
        Count of the number of articles found.
    """

    # Request page from URL
    page = req.get_url(url)
    page_soup = BeautifulSoup(page.content, 'lxml')

    # Get all count tags
    counts = extract(page_soup, 'count', 'all')

    try:
        count = int(counts[0].text)
    except IndexError:
        count = 0

    return count
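# A hedged sketch, not part of the original module: exercises _get_count with a
# stubbed requester instead of a live EUtils request. The stub classes, the
# response snippet, and the URL are made up for illustration; the real code
# passes a Requester() object and an esearch URL.
def _demo_get_count():

    class _FakePage(object):
        content = b'<eSearchResult><Count>42</Count></eSearchResult>'

    class _FakeReq(object):
        def get_url(self, url):
            return _FakePage()

    # With the stubbed response above, this should return 42
    return _get_count(_FakeReq(), 'http://example.invalid/esearch')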
def _scrape_papers(req, art_url, cur_dat):
    """Scrape information for each article found for a given term.

    Parameters
    ----------
    req : Requester() object
        Manages requests.
    art_url : str
        URL for the articles to be scraped.
    cur_dat : Data() object
        Object to store information for the current term.

    Returns
    -------
    cur_dat : Data() object
        Object to store information for the current term.
    """

    # Get page of all articles
    art_page = req.get_url(art_url)
    art_page_soup = BeautifulSoup(art_page.content, "xml")

    # Pull out articles
    articles = art_page_soup.findAll('PubmedArticle')

    # Loop through each article, extracting relevant information
    for ind, art in enumerate(articles):

        # Get ID of current article
        new_id = _process_ids(extract(art, 'ArticleId', 'all'), 'pubmed')

        # Extract and add all relevant info from current article to Data object
        cur_dat = _extract_add_info(cur_dat, new_id, art)

    return cur_dat
def _get_db_info(req, info_url):
    """Call EInfo to get the info and status of the database to be used for scraping.

    Parameters
    ----------
    req : Requester() object
        Manages requests.
    info_url : str
        URL to request database information from.

    Returns
    -------
    db_info : dict
        Database information.
    """

    # Get the info page and parse with BeautifulSoup
    info_page = req.get_url(info_url)
    info_page_soup = BeautifulSoup(info_page.content, 'lxml')

    # Set list of fields to extract from eInfo
    fields = ['dbname', 'menuname', 'description', 'dbbuild', 'count', 'lastupdate']

    # Extract basic information into a dictionary
    db_info = dict()
    for field in fields:
        db_info[field] = extract(info_page_soup, field, 'str')

    return db_info
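# A hedged sketch, not part of the original module: runs _get_db_info against a
# stubbed EInfo-style response instead of a live request. The stub classes, the
# response snippet, and the URL are made up for illustration.
def _demo_get_db_info():

    class _FakePage(object):
        content = (b'<eInfoResult><DbInfo>'
                   b'<DbName>pubmed</DbName><MenuName>PubMed</MenuName>'
                   b'<Description>PubMed bibliographic record</Description>'
                   b'<DbBuild>Build-2017</DbBuild><Count>27000000</Count>'
                   b'<LastUpdate>2017/01/01</LastUpdate>'
                   b'</DbInfo></eInfoResult>')

    class _FakeReq(object):
        def get_url(self, url):
            return _FakePage()

    # Returns a dict with the fields listed above, e.g. db_info['dbname'] == 'pubmed'
    return _get_db_info(_FakeReq(), 'http://example.invalid/einfo')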
def test_extract():
    """Test the extract function."""

    # Create a complex tag
    out = bs4.element.Tag(name='Out')
    inn1 = bs4.element.Tag(name='Inn')
    inn2 = bs4.element.Tag(name='Inn')
    inn1.append('words words')
    inn2.append('more words')
    out.append(inn1)
    out.append(inn2)

    # Test error - bad how
    with raises(ValueError):
        out_err = extract(out, 'Inn', 'bad')

    # Test how = 'raw'
    out_raw = extract(out, 'Inn', 'raw')
    assert type(out_raw) is bs4.element.Tag

    # DROPPED CASE WITH MOVE TO PY35
    # Test how = 'txt'
    #out_txt = extract(out, 'Inn', 'txt')
    #assert isinstance(out_txt, UnicodeType)
    #assert out_txt == unicode('words words')

    # Test how = 'str'
    out_str = extract(out, 'Inn', 'str')
    #TODO: Figure this out? What's the return type?
    #assert isinstance(out_str, str)
    #assert out_str == 'words words'

    # Test how = 'all'
    out_all = extract(out, 'Inn', 'all')
    assert type(out_all) is bs4.element.ResultSet

    # Test with non-existent tag name
    out_none = extract(out, 'bad', 'raw')
    assert out_none is None
def _process_pub_date(pub_date):
    """Extract and process publication date data.

    Parameters
    ----------
    pub_date : bs4.element.Tag
        PubDate tag, which contains tags with publication date information.

    Returns
    -------
    year : int or None
        Year the article was published.
    month : str or None
        Month the article was published.
    """

    # Extract year, convert to int if not None
    year = extract(pub_date, 'Year', 'str')
    year = int(year) if year else year

    # Extract month
    month = extract(pub_date, 'Month', 'str')

    return year, month
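# A hedged usage sketch, not part of the original module: the kind of PubDate
# markup _process_pub_date expects. The sample XML and the helper name are made
# up for illustration, and assume extract's 'str' mode returns the tag text.
def _demo_process_pub_date():

    from bs4 import BeautifulSoup

    xml = '<PubDate><Year>2016</Year><Month>Nov</Month></PubDate>'
    pub_date = BeautifulSoup(xml, 'xml').find('PubDate')

    # Expected, under the assumption above: (2016, 'Nov')
    return _process_pub_date(pub_date)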
def _extract_add_info(cur_dat, new_id, art):
    """Extract information from an article page and add it to a Data object.

    Parameters
    ----------
    cur_dat : Data() object
        Object to store information for the current term.
    new_id : int
        Paper ID of the new paper.
    art : bs4.element.Tag() object
        Extracted pubmed article.

    Returns
    -------
    cur_dat : Data() object
        Object to store data from the current term.

    Notes
    -----
    Data extraction is all in try/except statements in order to deal with missing data,
    since fields may be missing.
    """

    # Add ID of current article
    cur_dat.add_id(new_id)
    cur_dat.add_title(extract(art, 'ArticleTitle', 'str'))
    cur_dat.add_authors(_process_authors(extract(art, 'AuthorList', 'raw')))
    cur_dat.add_journal(extract(art, 'Title', 'str'), extract(art, 'ISOAbbreviation', 'str'))
    cur_dat.add_words(_process_words(extract(art, 'AbstractText', 'str')))
    cur_dat.add_kws(_process_kws(extract(art, 'Keyword', 'all')))
    cur_dat.add_pub_date(_process_pub_date(extract(art, 'PubDate', 'raw')))
    cur_dat.add_doi(_process_ids(extract(art, 'ArticleId', 'all'), 'doi'))

    # Increment number of articles included in Data
    cur_dat.increment_n_articles()

    return cur_dat