def _scrape_urls(self, canonical_pmid):
    """
    Return a list of urls which might be a suitable provider, scraped from
    the NCBI PubMed page of the given record.

    canonical_pmid -- canonical PubMed identifier; its first five characters
        are dropped to obtain the bare id (presumably a "pmid:" prefix --
        confirm against the caller).

    Returns a one-element list holding the first link found under the
    "icons" element when there is one, otherwise every link found under the
    first "linkoutlist" element.  Returns an empty list when the page cannot
    be retrieved or contains no usable link.
    """
    ncbi_url = "http://www.ncbi.nlm.nih.gov/pubmed/" + canonical_pmid[5:]
    resp = util.http_get(ncbi_url)
    # a request that fails outright may not produce a response object at all
    if resp is None or resp.status_code != 200:
        return []

    soup = BeautifulSoup(resp.text)

    # first look for the canonical link under the "icons" class div;
    # href=True skips anchors that carry no href attribute (e.g. named
    # anchors), which would otherwise raise a KeyError on lookup
    icons = soup.find(class_="icons")
    if icons is not None:
        icon_links = icons.find_all("a", href=True)
        if icon_links:
            return [icon_links[0]["href"]]

    # if we don't find a usable link in an "icons" div, then we need to
    # scrape from the "linkoutlist"
    linkout = soup.find(class_="linkoutlist")
    if linkout is None:
        return []
    return [a["href"] for a in linkout.find_all("a", href=True)]
def _resolve_doi(self, canonical_pmid):
    """
    Look up the DOI recorded in the PubMed record of the given identifier
    and dereference it.

    canonical_pmid -- canonical PubMed identifier; its first five characters
        are dropped to obtain the bare id (presumably a "pmid:" prefix --
        confirm against the caller).

    Returns a (canonical_doi, location) tuple, where location is whatever
    DOIPlugin.dereference returns for the DOI.  Returns (None, None) when
    the record cannot be parsed or contains no DOI.

    Raises plugin.PluginException when the record cannot be retrieved from
    PubMed.
    """
    xml_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + canonical_pmid[5:] + "&retmode=xml"

    # fetch the XML version of the PubMed record
    response = util.http_get(xml_url)
    # a request that fails outright may not produce a response object at all
    if response is None or response.status_code != requests.codes.ok:
        raise plugin.PluginException(plugin.PluginException.HTTP, "unable to retrieve record from PubMed")

    try:
        xml = etree.fromstring(response.text.encode("utf-8"))
    except Exception as e:
        # deliberately best-effort: an unparseable record means "no DOI",
        # but keep the reason in the log
        log.error("Error parsing the XML from " + xml_url)
        log.error(e)
        return None, None

    xp = "/PubmedArticleSet/PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType='doi']"
    els = xml.xpath(xp)

    if len(els) == 0:
        # we didn't find a DOI
        return None, None

    # FIXME: we assume there is only one DOI in the record - is this really true?
    doi_string = els[0].text
    if not doi_string:
        # the DOI element is present but empty; treat it as "no DOI"
        return None, None

    doi = DOIPlugin()
    canonical_doi = doi.canonical_form(doi_string)
    loc = doi.dereference(canonical_doi)

    return canonical_doi, loc
def resolve_doi(doi):
    """
    Resolve a DOI to the URL it redirects to.

    doi -- the bare DOI; it is appended directly to "http://dx.doi.org/".

    A cheap HEAD request is tried first, with a fallback to util.http_get.
    Returns the final URL of whichever request ended somewhere other than
    the dx.doi.org URL, or the string "DOI could not be resolved." when
    neither did.
    """
    url = "http://dx.doi.org/" + doi

    # requests.head() does not follow redirects unless asked to, and without
    # following them the response URL is always the one we requested
    r = requests.head(url, timeout=config.CONN_TIMEOUT, allow_redirects=True)
    if r.url != url:
        return r.url

    # fall back to a GET; only the response object of the returned triple
    # is of interest here
    r, _content, _length = util.http_get(url)
    # NOTE(review): guarding against a missing response in case http_get
    # signals failure that way -- confirm against util.http_get
    if r is not None and r.url != url:
        return r.url

    return "DOI could not be resolved."
def dereference(self, canonical):
    """
    Dereference a canonical DOI and return the URL it finally resolves to.

    canonical -- canonical DOI; its first four characters are dropped to
        obtain the bare DOI (presumably a "doi:" prefix -- confirm against
        canonical_form).

    Returns the URL of the response obtained for the dx.doi.org address.

    Raises plugin.PluginException when no response is obtained or the
    response status is not OK.
    """
    resolvable = "http://dx.doi.org/" + canonical[4:]

    # now dereference it and find out the target of the (chain of) 303(s)
    response = util.http_get(resolvable)

    # a request that fails outright may not produce a response object at
    # all; report that through the same exception as a bad status code
    if response is None:
        raise plugin.PluginException(
            plugin.PluginException.HTTP,
            "Unable to dereference DOI to valid endpoint, got no response",
        )
    if response.status_code != requests.codes.ok:
        raise plugin.PluginException(
            plugin.PluginException.HTTP,
            "Unable to dereference DOI to valid endpoint, got " + str(response.status_code),
        )

    return response.url
def test_03_http_get_long_timeout(self):
    """util.http_get gives a falsy result for the "/long_timeout" endpoint."""
    # presumably this endpoint outlasts the configured timeout -- confirm
    # against the test server fixture
    target = self.app_url + "/long_timeout"
    outcome = util.http_get(target)
    assert not outcome
def test_02_http_get_timeout(self):
    """util.http_get still delivers the "okay" body from the "/timeout" endpoint."""
    # presumably this endpoint is slow but recoverable -- confirm against
    # the test server fixture
    target = self.app_url + "/timeout"
    outcome = util.http_get(target)
    assert outcome
    assert outcome.text == "okay"
def test_01_http_get_normal(self):
    """util.http_get delivers the "okay" body from the "/normal" endpoint."""
    target = self.app_url + "/normal"
    outcome = util.http_get(target)
    assert outcome
    assert outcome.text == "okay"
def license_detect(self, record):
    """
    To respond to the provider identifier: http://elife.elifesciences.org

    This should determine the licence conditions of the eLife article and populate
    the record['bibjson']['license'] (note the US spelling) field.

    record -- the record under inspection; its doi_without_prefix is used to
        query the eLife XML API and every license found is handed to its
        add_license_object method.

    Returns the (short name, version) tuple of this plugin, or None when the
    XML for the article could not be retrieved or parsed.
    """
    # List of licensing statements to look for on this publisher's pages.
    # In eLife's case they take the form of {xpath string: meaning object}
    # since we're not scraping HTML, we're using an XML API.
    # meaning['type'] identifies the license (see licenses.py)
    # and meaning['version'] identifies the license version (if available)
    elife_license_mappings = self._license_mappings

    # 1. get DOI from record object
    # it MUST NOT HAVE the canonical DOI prefix, "doi:" or "DOI:"
    doi = record.doi_without_prefix
    if not doi:
        return (self._short_name, self.__version__)

    # 2. query elife XML api
    url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
    response = util.http_get(url)
    if response is None:
        # a request that fails outright may not produce a response object;
        # treat it like an unparseable document
        log.error("No response retrieving the XML from " + url)
        return None

    source_size = len(response.text)

    try:
        # response.text is already decoded text, so it must be encoded (not
        # decoded again) before being handed to the parser as bytes
        xml = etree.fromstring(response.text.encode("utf-8"))
    except Exception as e:
        log.error("Error parsing the XML from " + url)
        log.error(e)
        # no point in doing anything else
        return None

    # process the XML response
    namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

    for mapping in elife_license_mappings:
        # each mapping is a single-entry {xpath: meaning} dict
        xpath = next(iter(mapping))
        meaning = mapping[xpath]
        elements = xml.xpath(xpath, namespaces=namespaces)
        if len(elements) == 0:
            continue

        lic_type = meaning['type']

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)
        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'handler': self._short_name,
            'handler_version': self.__version__,
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            "source_size": source_size,
            'agent': config.agent,
            'category': 'xml_api',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': 'License decided by querying the eLife XML API at ' + url
        }

        record.add_license_object(license_obj)

    return (self._short_name, self.__version__)
def license_detect(self, record):
    """
    Determine the licence conditions of an Elsevier article by querying the
    Elsevier XML API, and add the resulting license to the record.

    A Creative Commons license URL found in the response decides the license
    type (and version, when the URL carries one).  When no license can be
    determined but the response flags the article as open access, the
    license falls back to 'free-to-read'.

    record -- the record under inspection; its provider_doi is used to query
        the API and any license found is handed to its add_license_object
        method.

    Returns the (short name, version) tuple of this plugin, or None when the
    XML for the article could not be retrieved or parsed.
    """
    # 1. get DOI from record object
    # it MUST HAVE the canonical DOI prefix, "doi:" or "DOI:"
    doi = record.provider_doi
    if not doi:
        return (self._short_name, self.__version__)

    # 2. query Elsevier XML api
    url = 'http://api.elsevier.com/content/article/' + doi
    response = util.http_get(url)
    if response is None:
        # a request that fails outright may not produce a response object;
        # treat it like an unparseable document
        log.error("No response retrieving the XML from " + url)
        return None

    response.encoding = 'utf-8'
    content = response.text
    source_size = len(content)

    # hand the parser UTF-8 bytes: text input carrying an XML encoding
    # declaration is rejected by lxml, and text has no decode() on Python 3
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    try:
        xml = etree.fromstring(content)
    except Exception as e:
        log.error("Error parsing the XML from " + url)
        log.error(e)
        # no point in doing anything else
        return None

    # process the XML response
    namespaces = {'elsevierapi': 'http://www.elsevier.com/xml/svapi/article/dtd'}

    # is it open access at all?
    # case insensitive search for the value "true" in the relevant element
    xpath_oa = "//elsevierapi:openaccessArticle//text()[contains(translate(., 'EURT', 'eurt'), 'true')]"
    it_is_oa = len(xml.xpath(xpath_oa, namespaces=namespaces)) > 0

    # now try to get the license too
    lic_type = None
    lic_version = None
    url_to_record = None

    xpath_license_extract = '//elsevierapi:openaccessUserLicense'
    elements = xml.xpath(xpath_license_extract, namespaces=namespaces)
    license_url = elements[0].text if len(elements) > 0 else None

    if license_url:
        urlparts = self.clean_url(license_url).split('/')
        # a creative commons URL without a license type part is of no use;
        # the free-to-read fallback below deals with that case
        if urlparts[0] == 'creativecommons.org' and len(urlparts) > 2:
            lic_type = 'cc-' + urlparts[2]
            # we know what the license is, so we can use the URL *they* specified
            url_to_record = license_url
            # we may know which CC license but not which version; that's
            # OK, just don't assert a version in the license record below
            if len(urlparts) > 3:
                lic_version = urlparts[3]

    if lic_type is not None and lic_type not in LICENSES:
        # a creative commons URL naming a license we have no record of;
        # discard it rather than fail on the LICENSES lookup below
        log.error("Unrecognised license type " + lic_type + " from " + url)
        lic_type = None
        lic_version = None
        url_to_record = None

    if it_is_oa and not lic_type:
        # Elsevier says the article's OA but we could not determine a license at all
        lic_type = 'free-to-read'

    if lic_type:
        meaning = {'type': lic_type}
        if lic_version:
            meaning['version'] = lic_version
        if url_to_record:
            meaning['url'] = url_to_record

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)
        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'handler': self._short_name,
            'handler_version': self.__version__,
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            "source_size": source_size,
            'agent': config.agent,
            'category': 'xml_api',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': 'License decided by querying the Elsevier XML API at ' + url
        }

        record.add_license_object(license_obj)

    return (self._short_name, self.__version__)