def __init__(self, soup, verbose=False):

    # Get entry content information
    mainContent = soup.find('div', {'class' : 'ArticleHeader'})
    if mainContent is None:
        raise ParseException('Unable to find main content of page')

    # Metadata:
    # --------------
    self.title = findValue(mainContent, 'h1', 'ArticleTitle', 'class').title()
    self.publication = findValue(mainContent, 'span', 'JournalTitle', 'class')

    # Two dates are given: the original publication date and the
    # online publication date. This returns the original journal pub date.
    yearwrapper = mainContent.find('span', {'class' : 'ArticleCitation_Year'})
    self.date = yearwrapper.find('time').text
    self.year = self.date[-4:]

    self.volume = findValue(mainContent, 'span', 'ArticleCitation_Volume', 'class')

    # SpringerLink doesn't seem to list article pages
    self.pages = None

    # SpringerLink keeps keywords below the abstract, separate from header info
    keybox = soup.find('div', {'class' : 'KeywordGroup'})
    if keybox is None:
        raise ParseException('Unable to find keywords')
    wordlist = keybox.find_all('span', {'class' : 'Keyword'})
    self.keywords = [w.text for w in wordlist]

    # DOI Retrieval:
    # --------------
    # This might be more reliable than assuming we have the DOI in the title
    self.doi = findValue(mainContent, 'p', 'article-doi', 'class')
    doi_startindex = self.doi.find('10.')
    self.doi = self.doi[doi_startindex:]  # Strip any leading text before the DOI

    # Authors:
    # --------
    # Find list items within the unordered list with class 'authors'.
    # Only classless <li> tags are matched so that the child <li> tags
    # corresponding to author affiliations are not also retrieved at this stage.
    authorList = mainContent.find('ul', {'class' : 'authors'}).find_all('li', {'class' : None})
    self.authors = [SpringerAuthor(x) for x in authorList]
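# `findValue` is used by every parser in this section but is defined elsewhere
# in the repo. A minimal sketch of the assumed behavior, inferred from the call
# sites above: find the first matching tag and return its text, or None if no
# tag matches. The signature is an assumption and may differ from the real helper.
def findValue(tags, tag_name, label_name=None, label_type=None):
    # e.g. findValue(mainContent, 'span', 'JournalTitle', 'class') looks for
    # <span class="JournalTitle"> and returns its text content
    if label_name is None:
        contents = tags.find(tag_name)
    else:
        contents = tags.find(tag_name, {label_type : label_name})
    if contents is None:
        return None
    return contents.text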
def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        HTML tags, as soup, of the reference. The information provided
        is what is needed to form a citation for the given reference.
    ref_id: int
        The zero-based position of the reference within the citing
        entry's bibliography. A value of 0 indicates that this object
        is the first reference in the bibliography.
    """
    super().__init__()

    self.ref_tags = ref_tags

    # Reference Bibliography Section:
    # --------------------------------
    self.ref_id = ref_id + 1  # Input is 0-indexed
    self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
    authorlist = ref_tags.find_all('span', {'class' : 'author'})
    self.authors = [x.text for x in authorlist]

    self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
    self.volume = findValue(ref_tags, 'span', 'vol', 'class')
    self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

    firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
    lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
    if (firstp is not None) and (lastp is not None):
        self.pages = firstp + '-' + lastp
    else:
        self.pages = None

    # Reference Meta Section:
    # ------------------------------
    self.crossref = None
    self.pubmed = None
    self.pubmed_central = None
    self.doi = None

    # External links (i.e. PubMed, CrossRef) are kept in a span tag
    links = ref_tags.find('span', {'class' : 'Occurrences'})

    # Only proceed if reference links were found
    if links is not None:
        links = links.find_all('span', {'class' : 'Occurrence'})

        # Check against all possible link options and save links.
        #
        # NOTE: links are returned URL-encoded (via urllib.quote()), but bare
        # DOI and PubMed IDs are not. This means that a DOI extracted from one
        # of the returned URLs needs to be unquoted first.
        #
        for link in links:
            href = link.find('a', href=True)['href']
            if 'OccurrenceDOI' in link['class']:
                self.crossref = href
            elif 'OccurrencePID' in link['class']:
                self.pubmed = href
            elif 'OccurrencePMCID' in link['class']:
                self.pubmed_central = href
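# Note on the membership tests above: BeautifulSoup treats `class` as a
# multi-valued attribute, so `link['class']` is a list of class names and
# `'OccurrenceDOI' in link['class']` is a list-membership check. A minimal
# illustration with a made-up snippet:
from bs4 import BeautifulSoup

span = BeautifulSoup('<span class="Occurrence OccurrenceDOI"></span>',
                     'html.parser').find('span')
print(span['class'])                     # ['Occurrence', 'OccurrenceDOI']
print('OccurrenceDOI' in span['class'])  # True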
def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        HTML tags, as soup, of the reference. The information provided
        is what is needed to form a citation for the given reference.
    ref_id: int
        The zero-based position of the reference within the citing
        entry's bibliography. A value of 0 indicates that this object
        is the first reference in the bibliography.
    """

    # Reference Bibliography Section:
    #--------------------------------
    self.ref_id = ref_id + 1  # Input is 0-indexed
    self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
    authorlist = ref_tags.find_all('span', {'class' : 'author'})
    self.authors = [x.text for x in authorlist]

    # Note: on Wiley, each reference author is given a separate <span> tag
    # with the class 'author', so individual authors can be extracted if needed.

    self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
    self.volume = findValue(ref_tags, 'span', 'vol', 'class')
    self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

    firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
    lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
    if (firstp is not None) and (lastp is not None):
        self.pages = firstp + '-' + lastp
    else:
        self.pages = None

    # Reference Meta Section:
    #------------------------------
    self.crossref = None
    self.pubmed = None
    self.pubmed_id = None
    self.doi = None
    self.citetimes = None
    self.cas = None
    self.abstract = None
    self.pdf_link = None
    self.ref_references = None

    # External links (i.e. PubMed, CrossRef, CAS) are kept in a <ul> tag.
    # Internal links (i.e. direct to abstract, references, etc.) are in a <div>.
    # Need to check for both.
    links = ref_tags.find('ul', {'class' : 'externalReferences'})
    if links is None:
        links = ref_tags.find('div', {'class' : 'internalReferences'})

    # Only proceed if either internal or external references were found
    if links is not None:
        links = links.find_all('li')

        # Check against all possible link options and save links.
        # href links are appended onto the base URL ('http://onlinelibrary.wiley.com')
        for link in links:
            label = link.text.lower()
            href = link.find('a', href=True)['href']
            href = urllib_quote(href)

            if 'crossref' in label:
                # Grab everything starting with '10.' in the link; the DOI in
                # the href is URL-encoded, so it needs to be unquoted
                doi_start = href.find('10.')
                if doi_start == -1:
                    self.doi = None
                else:
                    self.doi = urllib_unquote(href[doi_start:])
                # CrossRef link is in the form of _WY_URL/resolve/reference/XREF?id=10.#######
                self.crossref = _WY_URL + urllib_unquote(href)
            elif 'pubmed' in label:
                self.pubmed_id = re.search('[^id=]+$', href).group(0)[1:]  # the [1:] is to get rid of the leading '='
                self.pubmed_id = urllib_unquote(self.pubmed_id)
                self.pubmed = _WY_URL + urllib_unquote(href)
            elif 'web ' in label:
                self.citetimes = re.search('[^: ]+$', label).group(0)
            elif label in ('cas', 'cas,'):
                self.cas = _WY_URL + urllib_unquote(href)
            elif 'abstract' in label:
                self.abstract = _WY_URL + urllib_unquote(href)
            elif 'pdf' in label:
                self.pdf_link = _WY_URL + urllib_unquote(href)
            elif 'references' in label:
                self.ref_references = _WY_URL + urllib_unquote(href)
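# A minimal sketch of the DOI-unquoting step above: a DOI embedded in a
# percent-encoded href must be decoded before use. The example href is
# hypothetical; `urllib_unquote` is assumed to be the urllib unquote function
# imported under that alias, as used in the method above.
href = '/resolve/reference/XREF?id=10.1002%2Fexample.123'  # hypothetical link
doi = urllib_unquote(href[href.find('10.'):])
print(doi)  # 10.1002/example.123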
def __init__(self, soup, verbose=False):

    # Get entry content information
    mainContent = soup.find('div', {'id' : 'mainContent'})
    if mainContent is None:
        raise ParseException('Unable to find main content of page')

    # Check for 'Page Not Found'
    error404 = mainContent.find('div', {'id' : 'error'})
    if error404 is not None:
        raise ParseException('Article was not found.')

    # Metadata:
    #---------------
    self.title = findValue(mainContent, 'span', 'mainTitle', 'class')
    if self.title is not None:
        self.title = self.title.title()

    self.publication = findValue(mainContent, 'h2', 'productTitle', 'id')

    # For old journal issues, two dates are given: the original publication date
    # and the online publication date. This returns the original journal pub date.
    self.date = findValue(mainContent, 'span', 'issueDate', 'id')
    self.year = self.date[-4:]

    vol = findValue(mainContent, 'span', 'volumeNumber', 'id')
    vol = vol.lower().replace('volume ', '')
    issue = findValue(mainContent, 'span', 'issueNumber', 'id')
    issue = issue.lower().replace('issue ', '')
    self.volume = vol
    self.issue = issue

    self.pages = findValue(mainContent, 'span', 'issuePages', 'id')
    self.pages = self.pages[6:]  # Strip the leading 'Pages:' label

    # Keywords and Abstract:
    #----------
    productContent = soup.find('div', {'id' : 'productContent'})
    keybox = productContent.find('div', {'class' : 'keywordLists'})
    if keybox is None:
        self.keywords = None
    else:
        wordlist = keybox.find_all('li')
        self.keywords = [w.text for w in wordlist]

    abstract_section = productContent.find('div', {'id' : 'abstract'})
    if abstract_section is not None:
        self.abstract = abstract_section.text
    else:
        self.abstract = None

    # DOI Retrieval:
    #---------------
    # This might be more reliable than assuming we have the DOI in the title
    self.doi = findValue(mainContent, 'p', 'doi', 'id')
    self.doi = self.doi[5:]  # Strip the leading 'DOI: '

    # Authors:
    #---------
    # Find list items within the ordered list with id 'authors'
    authorList = mainContent.find('ol', {'id' : 'authors'}).find_all('li')
    self.authors = [WileyAuthor(x) for x in authorList]

    # Find all list items with the 'affiliation' class.
    # The content is kept in a <p> tag within each list item.
    aff_tags = mainContent.find_all('li', {'class' : 'affiliation'})
    self.affiliations = [a.find('p').text for a in aff_tags]

    # Clean up strings - Not sure if necessary
    for a in range(len(self.affiliations)):
        self.affiliations[a] = self.affiliations[a].replace(', and ', '')
        self.affiliations[a] = self.affiliations[a].replace(' ', '')

    corr = mainContent.find('p', {'id' : 'correspondence'})
    if corr is not None:
        email = findValue(corr, 'a', 'Link to email address', 'title')
    else:
        email = ''

    # Assign affiliations to authors
    for author in self.authors:
        author.populate_affiliations(self.affiliations)
        if author.contact == 1:
            author.email = email
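# Hypothetical usage sketch: fetching an article page and handing its soup to
# the parser above. `requests`, the helper name, and the `WileyEntry` class
# name are assumptions for illustration; the repo may name things differently.
import requests
from bs4 import BeautifulSoup

def fetch_wiley_entry(url):
    # Download the page and parse it with the constructor shown above
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return WileyEntry(soup)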
def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        HTML tags, as soup, of the reference. The information provided
        is what is needed to form a citation for the given reference.
    ref_id: int
        The zero-based position of the reference within the citing
        entry's bibliography. A value of 0 indicates that this object
        is the first reference in the bibliography.
    """
    super().__init__()

    # Reference Bibliography Section:
    #--------------------------------
    self.ref_id = ref_id + 1  # Input is 0-indexed
    self.title = findValue(ref_tags, 'span', 'title', 'class')
    authorlist = ref_tags.find_all('span', {'class' : 'author'})
    self.authors = [x.text for x in authorlist]

    # Note: each reference author is given a separate <span> tag with the
    # class 'author', so individual authors can be extracted if needed.

    self.publication = findValue(ref_tags, 'span', 'source-title', 'class')
    self.volume = findValue(ref_tags, 'span', 'volume', 'class')
    self.date = findValue(ref_tags, 'span', 'year', 'class')

    firstp = findValue(ref_tags, 'span', 'start-page', 'class')
    lastp = findValue(ref_tags, 'span', 'end-page', 'class')
    if (firstp is not None) and (lastp is not None):
        self.pages = firstp + '-' + lastp
    else:
        self.pages = None

    # Reference Meta Section:
    #------------------------------
    self.crossref = None
    self.pubmed = None
    self.doi = None
    self.cas = None
    self.isi = None
    self.ads = None

    # All links are kept in a <ul> tag with the class 'cleared'
    links = ref_tags.find('ul', {'class' : 'cleared'})

    # Only proceed if links are found
    if links is not None:
        links = links.find_all('li')

        # Check against all possible link options and save links.
        for link in links:
            label = link.text.lower()
            href = link.find('a', href=True)['href']

            if 'article' in label:
                # Grab everything starting with '10.' in the link
                doi_start = href.find('10.')
                self.doi = None if doi_start == -1 else href[doi_start:]
                # Called the 'Article' link, but the url is of the form
                # http://dx.doi.org/10.###### and redirects to the article page
                self.crossref = href
            elif 'pubmed' in label:
                self.pubmed = href
            elif 'cas' in label:
                self.cas = href
            elif 'isi' in label:
                self.isi = href
            elif 'ads' in label:
                self.ads = href
def __init__(self, soup, verbose=False):
    super().__init__()

    # Get entry content information
    content = soup.find('div', {'id' : 'content'})
    mainContent = content.find('header')
    if mainContent is None:
        raise ParseException('Unable to find main content of page')

    # Metadata:
    # ---------
    self.title = findValue(mainContent, 'h1', 'article-heading', 'class')

    # Isolate the entry citation line
    citation = mainContent.find('dl', {'class' : 'citation'})
    self.publication = findValue(citation, 'dd', 'journal-title', 'class')

    # For old journal issues, two dates are given: the original publication date
    # and the online publication date. This returns the original journal pub date.
    self.date = citation.find('time').text[1:-1]  # Strip the surrounding parentheses
    self.year = self.date[-4:]

    # Get rid of commas and newlines from the volume
    vol = findValue(citation, 'dd', 'volume', 'class').replace(',', '')
    self.volume = vol.replace('\n', '')

    self.pages = findValue(citation, 'dd', 'page', 'class')

    # Nature pages for some reason don't list keywords...
    self.keywords = None

    # DOI Retrieval:
    # --------------
    # This might be more reliable than assuming we have the DOI in the title
    self.doi = findValue(citation, 'dd', 'doi', 'class')
    self.doi = self.doi[4:]  # Strip the leading 'DOI:'

    # Abstract:
    # ---------
    self.abstract = ''
    abstract_section = soup.find('div', {'id' : 'abstract'})
    abstract_content = abstract_section.find('p')
    if abstract_content is not None:
        self.abstract = self.abstract + abstract_content.text

    # Authors:
    # --------
    # Find list items within the unordered list with class 'authors'
    authorList = mainContent.find('ul', {'class' : 'authors'}).find_all('li')
    self.authors = [NatureAuthor(x) for x in authorList]

    # Get the list of affiliations from the bottom of the page
    author_info = soup.find('div', {'id' : 'author-information'})
    aff_section = author_info.find('ol', {'class' : 'affiliations'})
    aff_tags = aff_section.find_all('li', recursive=False)
    self.affiliations = [a.find('h3').text for a in aff_tags]

    corr = author_info.find('a', {'class' : 'contact'})
    corr_name = corr.text
    email = _NT_URL + corr['href']

    # Assign affiliations to authors
    for author in self.authors:
        author.populate_affiliations(self.affiliations)
        if author.name == corr_name:
            author.email = email
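# Note on `recursive=False` above: it restricts find_all to direct children,
# so <li> tags nested inside a sub-list are not returned as separate results.
# A small illustration with a made-up snippet:
from bs4 import BeautifulSoup

ol = BeautifulSoup('<ol><li>A<ul><li>A.1</li></ul></li><li>B</li></ol>',
                   'html.parser').find('ol')
print(len(ol.find_all('li')))                   # 3 (includes the nested li)
print(len(ol.find_all('li', recursive=False)))  # 2 (direct children only)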
def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        HTML tags, as soup, of the reference. The information provided
        is what is needed to form a citation for the given reference.
    ref_id: int
        The zero-based position of the reference within the citing
        entry's bibliography. A value of 0 indicates that this object
        is the first reference in the bibliography.
    """
    super().__init__()

    self.ref_tags = ref_tags

    # Reference Bibliography Section:
    #--------------------------------
    self.ref_id = ref_id + 1  # Input is 0-indexed
    self.volume = None
    self.pages = None

    all_text = ref_tags.find_all(text=True)
    self.citation = all_text[1]

    # 'all_text' is a list of the text segments within each citation.
    # If it is a short list, the citation is likely a book and doesn't
    # include page numbers, a PMID, a DOI, etc.
    if len(all_text) > 5:
        metadata = all_text[3]
        metadata = metadata[2:]  # Get rid of the leading '; '
        divider = metadata.find(':')  # This divides the volume number from the page range
        self.volume = metadata[0:divider]
        self.pages = metadata[divider+1:metadata.find(';')]

    self.date = findValue(ref_tags, 'span')

    # Reference Link Section:
    #------------------------------
    self.crossref = None
    self.pubmed = None
    self.pubmed_id = None
    self.doi = None
    self.web_of_science = None

    # External links (i.e. PubMed, CrossRef) are kept in <a> tags,
    # while the IDs are conveniently kept in <pub-id> tags
    links = ref_tags.find_all('a')
    ids = ref_tags.find_all('pub-id')

    for pid in ids:
        id_type = pid['pub-id-type']
        if id_type == 'pmid':
            self.pubmed_id = pid.text
        elif id_type == 'doi':
            self.doi = pid.text

    # find_all returns a (possibly empty) list, so iterate over whatever was found
    for link in links:
        href = link['href'][1:]  # Get rid of the leading '/'
        text = link.text.lower()
        if 'crossref' in text:
            self.crossref = _TF_URL + href
        elif 'pubmed' in text:
            self.pubmed = _TF_URL + href
        elif 'science' in text:
            self.web_of_science = _TF_URL + href
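# Minimal illustration of the <pub-id> parsing above, using a made-up snippet;
# the tag and attribute names match those the loop above looks for.
from bs4 import BeautifulSoup

snippet = '<span><pub-id pub-id-type="doi">10.1000/example123</pub-id></span>'
ref = BeautifulSoup(snippet, 'html.parser')
for pid in ref.find_all('pub-id'):
    if pid['pub-id-type'] == 'doi':
        print(pid.text)  # 10.1000/example123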
def __init__(self, soup, verbose=False):
    super().__init__()

    # Get entry content information
    mainContent = soup.find('div', {'id' : 'journal_content'})
    if mainContent is None:
        mainContent = soup.find('div', {'id' : 'pb-page-content'})
    if mainContent is None:
        raise ParseException('Unable to find main content of page')

    # Metadata:
    # ---------
    titlebox = mainContent.find('div', {'class' : 'description'})
    if titlebox is not None:
        self.title = titlebox.find('h1').text.title()
    else:
        self.title = None

    # This box contains the publication name as well as the volume and issue
    pubbox = mainContent.find('div', {'class' : 'borderedmodule'})
    pubbox = pubbox.find('td')

    self.publication = findValue(pubbox, 'h2')
    if self.publication is not None:
        self.publication = self.publication.strip()

    # Parse out the integer values of the volume and issue
    vol_issue = pubbox.find('h3')
    if vol_issue is None:
        raise ParseException('Unable to find volume and issue data')
    else:
        vol_issue = vol_issue.text

    issue_index = vol_issue.find('Issue')

    # If an issue number is listed, extract it
    if issue_index != -1:
        vol_text = vol_issue[0:issue_index]
        all_issue_text = vol_issue[issue_index:]
        issue_text = all_issue_text[0:all_issue_text.find(',')]
        issue_num_text = [x for x in issue_text if x.isdigit()]
        self.issue = ''.join(issue_num_text)
    else:
        vol_text = vol_issue
        self.issue = None

    vol_num_text = [x for x in vol_text if x.isdigit()]
    self.volume = ''.join(vol_num_text)

    # Two dates are given: the original publication date and the
    # online publication date. This returns the original journal pub date.
    datebox = mainContent.find('div', {'class' : 'articleDates'})
    if datebox is None:
        raise ParseException('Unable to find publishing dates')
    alldates = datebox.find_all('li')
    full_date_text = alldates[-1].text
    date_index = full_date_text.find('Published online: ')
    if date_index > -1:
        date = full_date_text[(date_index + 18):]  # 18 == len('Published online: ')
    else:
        date = ''
    self.date = date
    self.year = self.date[-4:]

    # Keywords:
    # Taylor & Francis keeps keywords below the abstract, separate from header info
    abstract_section = mainContent.find('div', {'class' : 'abstract'})
    keybox = abstract_section.find('ul', {'class' : 'keywords'})
    if keybox is None:
        raise ParseException('Unable to find keywords')
    wordlist = keybox.find_all('li')
    # Keep only the text before the trailing comma (if one is present)
    self.keywords = [w.text.split(',')[0] for w in wordlist]

    metabox = mainContent.find('div', {'class' : 'doiMeta'})
    self.pages = findValue(mainContent, 'div', label_type='class', label_name='pageRange')

    # DOI Retrieval:
    # --------------
    # This might be more reliable than assuming we have the DOI in the title
    self.doi = findValue(metabox, 'dd')
    doi_startindex = self.doi.find('10.')
    self.doi = self.doi[doi_startindex:]  # Strip any leading text before the DOI

    # Authors:
    # --------
    # Author names are kept in <span> tags with the class 'hlFld-ContribAuthor'
    # inside the doiMeta box.
    authorList = metabox.find_all('span', {'class' : 'hlFld-ContribAuthor'})
    self.authors = [TaylorFrancisAuthor(x) for x in authorList]

    # Find the list of affiliations from the tabbed module at the bottom of the page
    tabModule = mainContent.find('div', {'id' : 'tabModule'})
    aff_list = tabModule.find('ul', {'class' : 'affiliations'})
    affs = aff_list.find_all('li')
    affiliations = []
    for aff in affs:
        affiliations.append(aff.text[1:])  # Get rid of the leading superscript letter

    # Assign affiliations to authors
    for author in self.authors:
        author.populate_affiliations(affiliations)
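# Hypothetical usage sketch tying the parsers together: fetch a page and route
# its soup to the matching entry class by publisher domain. The mapping, the
# helper name, and the entry class names are illustrative assumptions, not
# part of the repo; only ParseException appears in the code above.
import requests
from bs4 import BeautifulSoup

_PARSERS = {
    'link.springer.com'        : SpringerEntry,       # assumed class names
    'onlinelibrary.wiley.com'  : WileyEntry,
    'nature.com'               : NatureEntry,
    'tandfonline.com'          : TaylorFrancisEntry,
}

def parse_entry(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for domain, entry_class in _PARSERS.items():
        if domain in url:
            return entry_class(soup)
    raise ParseException('No parser registered for url: %s' % url)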