def parseBouquets(self, xmlnode):
    #print "parsing Bouquets", xmlnode
    bouquets = []
    for bouquet in xmlnode.getElementsByTagName('e2bouquet'):
        bref = urllib_unquote(bouquet.getElementsByTagName('e2bouquetreference')[0].childNodes[0].data)
        bname = urllib_unquote(bouquet.getElementsByTagName('e2bouquetname')[0].childNodes[0].data)
        #print "Bouquet",bref,bname
        bouquets.append({'bname': bname, 'bref': bref, 'services': self.parseServices(bouquet)})
    return bouquets

def parseServices(self, xmlnode):
    #print "parsing Services", xmlnode
    services = []
    for service in xmlnode.getElementsByTagName('e2servicelist')[0].getElementsByTagName('e2service'):
        sref = urllib_unquote(service.getElementsByTagName('e2servicereference')[0].childNodes[0].data)
        sname = urllib_unquote(service.getElementsByTagName('e2servicename')[0].childNodes[0].data)
        sname = sname.replace(self.undefinded_tag, "<n/a>").replace(self.undefinded_and, "&")
        #print sref,sname
        services.append({'sref': sref, 'sname': sname})
    return services

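
# --- Hypothetical usage sketch (not from the original source) ----------------
# The XML shape below is inferred purely from the tag names the two parsers
# look up; a real Enigma2 web-interface response carries more fields, the
# wrapper tag name and the reference strings here are made-up placeholders,
# and urllib_unquote is assumed to behave like urllib.parse.unquote.
from xml.dom.minidom import parseString

_sample_doc = parseString(
    "<e2servicelistrecursive>"  # wrapper tag name is an assumption
    "<e2bouquet>"
    "<e2bouquetreference>1:7:1:0:0:0:0:0:0:0:</e2bouquetreference>"
    "<e2bouquetname>Favourites%20(TV)</e2bouquetname>"
    "<e2servicelist>"
    "<e2service>"
    "<e2servicereference>1:0:1:445D:453:1:C00000:0:0:0:</e2servicereference>"
    "<e2servicename>Example%20Channel</e2servicename>"
    "</e2service>"
    "</e2servicelist>"
    "</e2bouquet>"
    "</e2servicelistrecursive>"
)
# self.parseBouquets(_sample_doc) would then return roughly:
# [{'bname': 'Favourites (TV)', 'bref': '1:7:1:0:0:0:0:0:0:0:',
#   'services': [{'sref': '1:0:1:445D:453:1:C00000:0:0:0:',
#                 'sname': 'Example Channel'}]}]
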
def _unquote_path(path):
    # MK1996 says, 'If a %xx encoded octet is encountered it is unencoded
    # prior to comparison, unless it is the "/" character, which has
    # special meaning in a path.'
    # Temporarily swap any encoded "/" for a newline so it survives unquoting,
    # then restore it afterwards.
    path = re.sub("%2[fF]", "\n", path)
    path = urllib_unquote(path)
    return path.replace("\n", "%2F")

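
# Quick illustration (not from the original source), assuming urllib_unquote
# behaves like urllib.parse.unquote: ordinary escapes are decoded, while an
# encoded slash survives (normalised to uppercase %2F).
assert _unquote_path("docs%2Fguide/intro%20page") == "docs%2Fguide/intro page"
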
def _update_counts(s, eids, resolve_url):
    """
    Helper for get_references()

    Parameters
    ----------
    s : Session
        The Requests Session
    eids : list
        List of eids but with a particular format e.g. ... TODO
    resolve_url : string
        This is a hardcoded value, eventually we'll pull this from the class

    Returns
    -------
    """
    payload = {'_updateCitedBy': ''.join(eids)}
    # r = s.get(resolve_url, params=payload)
    r = _selenium_connect(resolve_url)
    # TODO: Check for 200
    data = urllib_unquote(r)

    # myXabsCounts['citedBy_26']='Citing Articles (41)';
    cited_by_results = re.findall(r"myXabsCounts\['citedBy_(\d+)'\]='[^\(]+\((\d+)", data)
    # TODO: parse response
    # ????? Why is the order scrambled - this seems to be on their end ...????
    '''
    NOTE: This is now Citing Articles, references to Scopus have been dropped
    myXabsCounts['citedBy_16']='Cited By in Scopus (128)';
    myXabsCounts['citedBy_15']='Cited By in Scopus (25)';
    myXabsCounts['citedBy_1']='Cited By in Scopus (2)';
    myXabsCounts['citedBy_3']='Cited By in Scopus (29)';
    '''
    # TODO: go through refs and apply new values ...
    return cited_by_results

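
# Worked example (not from the original source) of what the regex above pulls
# out of the resolver response, using the sample line quoted in the comments:
import re

_sample = "myXabsCounts['citedBy_26']='Citing Articles (41)';"
_counts = re.findall(r"myXabsCounts\['citedBy_(\d+)'\]='[^\(]+\((\d+)", _sample)
# _counts -> [('26', '41')]  i.e. (1-based reference index, citing-article count)
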
def get_references(input, verbose=False):
    """
    This function gets references for a ScienceDirect URL that is of the form:

        http://www.sciencedirect.com/science/article/pii/################

    e.g. http://www.sciencedirect.com/science/article/pii/0006899387903726

    Implementation Notes:
    ---------------------
    From what I can tell this information is not exposed via the Elsevier API.

    In order to minimize complexity, the mobile site is requested via a cookie.

    Code Layout and Algorithm Notes:
    --------------------------------

    """
    # TODO: Make this a class reference parser

    # *** These tags are mobile-site specific

    # When we don't have proper access rights, this is present in the html
    GUEST_TAG_TUPLE = ("li", {"id": "menuGuest"})

    # Entries are "li" tags with classes of the form:
    #   article-reference-article
    #   article-reference-other-ref
    REFERENCE_TAG_TUPLE = ("li", {"class": re.compile('article-reference-*')})

    # This is the URL to the page that contains the document info, including
    # reference material
    BASE_URL = _SD_URL + '/science/article/pii/'

    # This URL was found first via Fiddler, then via closer inspection of the script
    # 'article_catalyst.js' under sciencedirect.com/mobile/js in the function
    # resolveReferences
    REF_RESOLVER_URL = _SD_URL + '/science/referenceResolution/ajaxRefResol'

    # Return the BeautifulSoup result, the requests session, and the requests response
    if _is_url(input):
        pii = _extract_pii(input)
    else:
        pii = input

    sess = requests.Session()

    if verbose:
        print('Requesting main page for pii: %s' % pii)
    resp = sess.get(BASE_URL + pii, cookies={'Site': 'Mobile'})

    # Step 2 - Get the reference tags
    soup = BeautifulSoup(resp.text)

    reference_section = soup.find("ol", {"class": "article-references"})

    if reference_section is None:
        # Then we might be a guest. In other words, we might not have sufficient
        # privileges to access the data we want. Generally this is protected via
        # IP mask. When I'm working from home I need to VPN into work so
        # that I can access the data :/
        print("reference_section is None")
        temp = soup.find(*GUEST_TAG_TUPLE)
        if temp is None:
            # We might have no references ... (Doubtful)
            raise ParseException("References were not found ..., code error likely")
        else:
            raise InsufficientCredentialsException(
                "Insufficient access rights to get references, requires certain IP addresses (e.g. university based IP)")

    ref_tags = reference_section.find_all(*REFERENCE_TAG_TUPLE)

    n_refs = len(ref_tags)

    if n_refs == 0:
        return None

    # Step 3 - Resolve reference links
    # --------------------------------------------------------------------------
    # The returned html code contains javascript which returns more information
    # about each reference, such as:
    #
    #   - links to the full text
    #   - DOI

    # Step 3.1 - Make the request for the information
    # --------------------------------------------------------------------------
    # We need the eid of the current entry, it is of the form:
    #
    #   SDM.pm.eid = "1-s2.0-0006899387903726"
    #
    # * I think this entry gets deleted after the requests so it may not be
    #   visible if looking for it in Chrome.
    match = re.search(r'SDM\.pm\.eid\s*=\s*"([^"]+)"', resp.text)
    #eid = match.group(1)

    # This list comes from the resolveReferences function in article_catalyst.js
    payload = {
        '_pii': pii,
        '_refCnt': n_refs,
        '_docType': 'article',  # yikes, this might change ...
        '_refRangeStart': '1',
        '_refRangeCount': str(n_refs)}  # This is normally in sets of 20's ...

    # I'm not sure if it is important to limit this. The browser then
    # makes a request from 1 count 20, 21 count 20, 41 count 20 etc.
    # It always goes by 20 even if there aren't 20 left.

    if verbose:
        print('Requesting reference links')
    r2 = sess.get(REF_RESOLVER_URL, params=payload)

    # Step 3.2 - Parse the returned information into single entries
    # --------------------------------------------------------------------------
    # This could probably be optimized in terms of execution time. We basically
    # get back a single script tag. Inside is some sort of hash map for links
    # for each reference.
    #
    # The script tag is of the form:
    #   myMap['bibsbref11']['refHtml']= "<some html stuffs>";
    #   myMap['bibsbref11']['absUrl']= "http://www.sciencedirect.com/science/absref/sd/0018506X7790068X";
    #   etc.
    #
    #   - Each entry is quite long.
    #   - Normally contains html
    #   - can be empty i.e. myMap['bibsbref11']['refHtml'] = "";
    #   - the refHtml is quite interesting
    #   - the absolute url is not always present (and currently not parsed)
    more_soup = BeautifulSoup(r2.text)
    script_tag = more_soup.find('script')

    # We unquote the script text as it is transmitted with characters escaped
    # and we want the parsed data to contain the non-escaped text
    #
    # We might eventually want to move this to being after the regular expression ...
    script_text = urllib_unquote(script_tag.text)

    ref_match_result = re.findall(r"myMap\['bibsbref(\d+)'\]\['refHtml'\]=\s?" + '"([^"]*)";', script_text)
    # Tokens:
    # 0 - the # from bibsbref#
    # 1 - the html content from the 'refHtml' entry
    #
    # NOTE: We don't really use the #, so we might remove the () around
    # \d+ which would shift the index from 1 to 0

    if verbose:
        print('Creating reference objects')
    if len(ref_match_result) > 0:
        zipped = zip(ref_tags, ref_match_result, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id, ref_link_info[1])
                       for ref_tag, ref_link_info, ref_id in zipped]
    else:
        zipped = zip(ref_tags, range(n_refs))
        ref_objects = [ScienceDirectRef(ref_tag, ref_id) for ref_tag, ref_id in zipped]

    # Step 4:
    # --------------------------------------------------------------------------
    # TODO: Improve documentation for this step
    if verbose:
        print('Retrieving Scopus Counts')

    ref_scopus_eids = []  # The Scopus IDs of the references to resolve,
    # but with a particular formatting ...
    ref_count = 0  # Number of references we haven't resolved

    ref_count_list = []
    # NOTE: Browser requests these in the reverse order ...
    for ref_id, ref in enumerate(ref_objects):
        if ref._data_sceid is not None:
            ref_scopus_eids.append(ref._data_sceid + ',' + str(ref_id + 1) + '~')
            ref_count += 1

            # If we've got enough, then update the counts
            # The 20 may be arbitrary but it was what was used in original JS
            if ref_count > 20:
                ref_count_list += _update_counts(sess, ref_scopus_eids, REF_RESOLVER_URL)
                ref_count = 0
                ref_scopus_eids = []

    # Get any remaining reference counts
    if ref_count != 0:
        ref_count_list += _update_counts(sess, ref_scopus_eids, REF_RESOLVER_URL)

    # Take the raw data and set the citation count for each object
    for ref_tuple in ref_count_list:
        ref_id = int(ref_tuple[0]) - 1
        ref_count = int(ref_tuple[1])
        ref_objects[ref_id].scopus_cite_count = ref_count

    # All done!
    # ---------
    return ref_objects

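
# Hypothetical usage (not from the original source), using the example URL from
# the docstring above; requires network access and sufficient access rights
# (e.g. a university IP), otherwise InsufficientCredentialsException is raised:
if __name__ == '__main__':
    refs = get_references(
        'http://www.sciencedirect.com/science/article/pii/0006899387903726',
        verbose=True)
    for ref in refs or []:
        print(ref.ref_id, ref.title, ref.scopus_cite_count)
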
def __init__(self, ref_tags, ref_id, ref_link_info=None):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_link_info: str
        Html, not yet souped. Contains extra information such as links to
        a pdf (if known) and other goodies
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    super().__init__()

    # Reference Bibliography Section:
    # -------------------------------
    # Example str: <span class="r_volume">Volume 47</span>
    self.ref_id = ref_id + 1  # Input is 0 based
    self.title = findValue(ref_tags, 'li', 'reference-title', 'class')

    all_authors = ref_tags.find_all('span', {'class': 'reference-author'})
    self.authors = [x.text for x in all_authors]
    #self.authors = findValue(ref_tags, 'li', 'reference-author', 'class')
    # NOTE: We can also get individual authors if we would like.
    #
    # Search would be on:
    #   <span class="reference-author">
    # instead of on the list.

    # Unfortunately r_publication is found both for the title and for
    # the publication. Some custom code is needed to first go into a r_series
    # span and then to the publication
    self.publication = None
    r_source_tag = ref_tags.find('span', {'class': 'r_series'})
    if r_source_tag is not None:
        pub_tag = r_source_tag.find('span', {'class': 'r_publication'})
        if pub_tag is not None:
            self.publication = pub_tag.text.replace('\\xa0', ' ')

    temp_volume = findValue(ref_tags, 'span', 'r_volume', 'class')
    if temp_volume is None:
        self.volume = None
    else:
        self.volume = temp_volume.replace('Volume ', '')

    self.issue = findValue(ref_tags, 'span', 'r_issue', 'class')
    self.series = findValue(ref_tags, 'span', 'r_series', 'class')
    self.date = findValue(ref_tags, 'span', 'r_pubdate', 'class')

    temp_pages = findValue(ref_tags, 'span', 'r_pages', 'class')
    if temp_pages is None:
        self.pages = None
    else:
        # TODO: is the unicode working properly ??? 576–577 and ideally 576-577
        self.pages = temp_pages.replace('pp. ', '')

    # Reference Meta Section:
    # -----------------------
    self.scopus_link = None
    self.doi = None
    self._data_sceid = None
    self.pii = None
    self.pdf_link = None
    self.scopus_cite_count = None
    self.aps_full_text = None

    if ref_link_info is not None:
        link_soup = BeautifulSoup(ref_link_info)

        # Each section is contained in a div tag with the class boxLink, although
        # some classes have more text in the class attribute, thus the *)
        #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
        box_links = link_soup.find_all('div', {'class': 'boxLink'})

        # This code is a bit hard to read but each 'if statement' shows what
        # is needed in order to resolve the item.
        for box_link in box_links:
            div_class_values = box_link.attrs['class']
            link_tag = box_link.find('a')
            if 'SC_record' in div_class_values:
                # "View Record in Scopus"
                # They changed to returning a full link
                # I should really use a library to resolve based on both
                # although the input should be the current page, not the base
                # self.scopus_link = _SD_URL + link_tag.attrs['href']
                self.scopus_link = link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                # Link to PDF
                self.pdf_link = _SD_URL + link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                # Article Link
                temp = link_tag.attrs['href']
                match = re.search('/pii/(.*)', temp)
                self.pii = match.group(1)
                self.doi = self.doi_from_crossref(self.pii)
            elif 'CrossRef' in box_link.text:
                # CrossRef link provides DOI as href
                # In old code it was a query parameter but this
                # has now moved to a "data-url" attribute
                temp = link_tag.attrs['href']
                # http://dx.doi.org/10.1037%2Fh0075243
                match = re.search(r'dx\.doi\.org/(.*)', temp)
                # Unquote removes %xx escape characters
                self.doi = urllib_unquote(match.group(1))
            elif "Purchase" in box_link.text:
                # New link added to Purchase pdf. It was throwing errors
                pass
            elif 'aps full text' in box_link.text.lower():
                self.aps_full_text = link_tag.attrs['href']
            else:
                span_tag = link_tag.find('span')
                if 'citedBy_' in span_tag.attrs['class']:
                    # Cited By Scopus Count
                    #
                    # NOTE: Apparently the citedByScopus doesn't get added
                    # until later so we need to look for the scan tag. Let's
                    # do this only if all else fails.
                    self._data_sceid = span_tag.attrs['data-sceid']
                else:
                    raise Exception('Failed to match link')

    # Finally, update if it is not an article
    tag_class = ref_tags.get('class')[0]
    if tag_class == 'article-reference-other-ref':
        publication = ref_tags.find('em')
        if publication is not None:
            self.publication = publication.text
        self.title = ref_tags.text

def __init__(self, ref_tags, ref_id):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    # Reference Bibliography Section:
    # -------------------------------
    self.ref_id = ref_id + 1  # Input is 0 indexed
    self.title = findValue(ref_tags, 'span', 'articleTitle', 'class')
    authorlist = ref_tags.find_all('span', 'author', 'class')
    self.authors = [x.text for x in authorlist]

    # Note: we can also get individual authors if we would like.
    #
    # On Wiley, each reference author is given a separate <span> tag with the
    # class 'author', so individual authors can be extracted
    #
    self.publication = findValue(ref_tags, 'span', 'journalTitle', 'class')
    self.volume = findValue(ref_tags, 'span', 'vol', 'class')
    self.date = findValue(ref_tags, 'span', 'pubYear', 'class')

    firstp = findValue(ref_tags, 'span', 'pageFirst', 'class')
    lastp = findValue(ref_tags, 'span', 'pageLast', 'class')
    if (firstp is not None) and (lastp is not None):
        self.pages = firstp + '-' + lastp
    else:
        self.pages = None

    # Reference Meta Section:
    # -----------------------
    self.crossref = None
    self.pubmed = None
    self.pubmed_id = None
    self.doi = None
    self.citetimes = None
    self.cas = None
    self.abstract = None
    self.pdf_link = None
    self.ref_references = None

    # External links (i.e. PubMed, CrossRef, CAS) are kept in a ul tag
    # Internal links (i.e. direct to abstract, references, etc.) are in a div
    # Need to check for both
    links = ref_tags.find('ul', 'externalReferences', 'class')
    if links is None:
        links = ref_tags.find('div', 'internalReferences', 'class')

    # Only proceed if either internal or external references were found
    if links is not None:
        links = links.find_all('li')

        # Check against all possible link options and save links.
        # href links are appended onto base URL ('http://onlinelibrary.wiley.com')
        #
        for link in links:
            label = link.text.lower()
            href = link.find('a', href=True)['href']
            href = urllib_quote(href)

            if 'crossref' in label:
                # Grab everything starting with '10.' in the link
                doi_start = href.find('10.')
                if doi_start == -1:
                    self.doi = None
                else:
                    self.doi = urllib_unquote(href[doi_start:])
                # CrossRef link is in the form of _WY_URL/resolve/reference/XREF?id=10.#######
                self.crossref = _WY_URL + urllib_unquote(href)
            elif 'pubmed' in label:
                self.pubmed_id = re.search('[^id=]+$', href).group(0)[1:]  # the [1:] is to get rid of leading '='
                self.pubmed_id = urllib_unquote(self.pubmed_id)
                self.pubmed = _WY_URL + urllib_unquote(href)
            elif 'web ' in label:
                self.citetimes = re.search('[^: ]+$', label).group(0)
            elif label in ('cas', 'cas,'):
                self.cas = _WY_URL + urllib_unquote(href)
            elif 'abstract' in label:
                self.abstract = _WY_URL + urllib_unquote(href)
            elif 'pdf' in label:
                self.pdf_link = _WY_URL + urllib_unquote(href)
            elif 'references' in label:
                self.ref_references = _WY_URL + urllib_unquote(href)

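
# Minimal sketch (not from the original source) of the CrossRef branch above,
# with a hypothetical href; real hrefs come from onlinelibrary.wiley.com and
# may differ in detail. Assumes urllib_unquote behaves like urllib.parse.unquote.
_href = '/resolve/reference/XREF?id=10.1037%2Fh0075243'
_doi = urllib_unquote(_href[_href.find('10.'):])
# _doi -> '10.1037/h0075243'
# _WY_URL + urllib_unquote(_href) would then give the full CrossRef resolver link.
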
def unquote(uri):
    """Specialized unquote that uses UTF-8 for parsing."""
    uri = uri.encode("ascii")
    unquoted = urllib_unquote(uri)
    return unquoted.decode("utf-8")

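
# Example (not from the original source); assumes a Python 2 style
# urllib_unquote that accepts byte strings, which is what the ascii
# encode / utf-8 decode round-trip above implies:
#
#     unquote("Caf%C3%A9")  ->  u"Café"
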
def __init__(self, ref_tags, ref_id, ref_link_info=None):
    """
    Parameters:
    -----------
    ref_tags: bs4.element.Tag
        Html tags as soup of the reference. Information provided is that
        needed in order to form a citation for the given reference.
    ref_link_info: str
        Html, not yet souped. Contains extra information such as links to
        a pdf (if known) and other goodies
    ref_id: int
        The id of the reference as ordered in the citing entry. A value of 1
        indicates that this object is the first reference in the bibliography.
    """
    super().__init__()

    # Reference Bibliography Section:
    # -------------------------------
    ref = ref_tags.find('ul', {'class': 'reference'})
    if ref is None:
        import pdb
        pdb.set_trace()

    # Example str: <span class="r_volume">Volume 47</span>
    self.ref_id = ref_id + 1  # Input is 0 based
    self.title = findValue(ref, 'li', 'title', 'class')

    all_authors = ref_tags.find('li', {'class': 'author'})
    author_text = all_authors.text
    author_list = author_text.split(', ')
    self.authors = []
    for x in author_list:
        x = x.strip()
        self.authors.append(x)

    # Unfortunately r_publication is found both for the title and for
    # the publication. Some custom code is needed to first go into a r_series
    # span and then to the publication
    self.publication = ''
    self.volume = ''
    self.date = ''
    self.pages = ''

    source = ref.find('li', {'class': 'source'})
    source = source.text
    source_parts = source.split(', ')

    # Attempt to parse publication into sections
    found = 0
    for elt in source_parts:
        # This first part is looking for the date, which is written (xxxx).
        # It is also saved as volume because they are within the same element
        # and sometimes the volume is also in parentheses.
        if '(' in elt:
            found = 1
            self.volume = elt
            self.date = elt

        # Until the date/volume part is found, save everything before as publication.
        # This can either be simple like 'Nature' or more complex, with a description
        # and location, in which case, it would be multiple elements in source_parts.
        if found == 0:
            self.publication = self.publication + elt

        # Find and save the pages portion, usually notated either with 'p. ' or 'pp. '
        if 'p. ' in elt:
            self.pages = elt
            self.pages = self.pages.replace('p. ', '')
            self.pages = self.pages.replace('p', '')

    # This checks if the parsing didn't work. All of the information
    # should still be retained even if not parsed correctly.
    if self.date == '' and self.pages == '':
        self.publication = source

    # Reference Meta Section:
    # -----------------------
    self.scopus_link = None
    self.doi = None
    self._data_sceid = None
    self.pii = None
    self.pdf_link = None
    self.scopus_cite_count = None
    self.aps_full_text = None

    if ref_link_info is None:
        link_soup = ref.find('li', {'class': 'external'})
    else:
        link_soup = BeautifulSoup(ref_link_info)

    if link_soup is not None:
        # Each section is contained in a div tag with the class boxLink, although
        # some classes have more text in the class attribute, thus the *)
        #box_links = link_soup.find_all('div', {'class': re.compile('boxLink*')})
        box_links = link_soup.find_all('div', {'class': 'boxLink'})

        # This code is a bit hard to read but each 'if statement' shows what
        # is needed in order to resolve the item.
        for box_link in box_links:
            div_class_values = box_link.attrs['class']
            link_tag = box_link.find('a')
            if 'SC_record' in div_class_values:
                # "View Record in Scopus"
                # They changed to returning a full link
                # I should really use a library to resolve based on both
                # although the input should be the current page, not the base
                # self.scopus_link = _SD_URL + link_tag.attrs['href']
                self.scopus_link = link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'S_C_pdfLink' in link_tag.attrs['class']:
                # Link to PDF
                self.pdf_link = _SD_URL + link_tag.attrs['href']
            elif 'class' in link_tag.attrs and 'cLink' in link_tag.attrs['class']:
                # Article Link
                temp = link_tag.attrs['href']
                match = re.search('/pii/(.*)', temp)
                self.pii = match.group(1)
                self.doi = self.doi_from_crossref(self.pii)
            elif 'CrossRef' in box_link.text:
                # CrossRef link provides DOI as href
                # In old code it was a query parameter but this
                # has now moved to a "data-url" attribute
                temp = link_tag.attrs['href']
                # http://dx.doi.org/10.1037%2Fh0075243
                match = re.search(r'dx\.doi\.org/(.*)', temp)
                # Unquote removes %xx escape characters
                self.doi = urllib_unquote(match.group(1))
            elif "Purchase" in box_link.text:
                # New link added to Purchase pdf. It was throwing errors
                pass
            elif 'aps full text' in box_link.text.lower():
                self.aps_full_text = link_tag.attrs['href']
            else:
                span_tag = link_tag.find('span')
                if 'citedBy_' in span_tag.attrs['class']:
                    # Cited By Scopus Count
                    #
                    # NOTE: Apparently the citedByScopus doesn't get added
                    # until later so we need to look for the scan tag. Let's
                    # do this only if all else fails.
                    self._data_sceid = span_tag.attrs['data-sceid']
                else:
                    raise Exception('Failed to match link')

    # Finally, update if it is not an article
    ref_tag_ul = ref_tags.find('ul')
    tag_class = ref_tag_ul.get('class')
    if tag_class is not None:
        tag_class = tag_class[0]
        if tag_class == 'article-reference-other-ref':
            publication = ref_tags.find('em')
            if publication is not None:
                self.publication = publication.text
            self.title = ref_tags.text
