Example #1
0
 def _scrape_urls(self, canonical_pmid):
     """
     Return a list of urls which might be a suitable provider from the NCBI page.

     The PubMed page for the identifier is fetched and scraped in two steps:
     the first link found inside the element with class "icons" is preferred;
     if there is none, every link inside the first element with class
     "linkoutlist" is returned instead.

     :param canonical_pmid: identifier whose first five characters are
         dropped before it is appended to the PubMed URL (presumably the
         canonical "pmid:" prefix -- confirm against the caller)
     :return: list of href strings; empty when the page could not be
         retrieved or contains no usable links
     """
     ncbi_url = "http://www.ncbi.nlm.nih.gov/pubmed/" + canonical_pmid[5:]
     resp = util.http_get(ncbi_url)
     # http_get may give back nothing at all (e.g. when the request times
     # out), so guard before touching status_code
     if resp is None or resp.status_code != 200:
         return []

     soup = BeautifulSoup(resp.text)

     # first look for the canonical link under the "icons" class div;
     # href=True skips anchors that carry no href attribute, which would
     # otherwise raise a KeyError on lookup
     icons = soup.find(class_="icons")
     if icons is not None:
         anchors = icons.find_all("a", href=True)
         if anchors:
             return [anchors[0]['href']]

     # if we don't find a usable "icons" link, scrape from the first "linkoutlist"
     linkout = soup.find(class_="linkoutlist")
     if linkout is None:
         return []
     return [a['href'] for a in linkout.find_all("a", href=True)]
Example #2
0
    def _resolve_doi(self, canonical_pmid):
        """
        Look up the DOI recorded in the PubMed record and dereference it.

        The record is fetched as XML from the NCBI efetch service, the first
        ArticleId element with IdType='doi' is read, and the DOI plugin is
        used to canonicalise and dereference it.

        :param canonical_pmid: identifier whose first five characters are
            dropped before it is placed in the efetch URL (presumably the
            canonical "pmid:" prefix -- confirm against the caller)
        :return: tuple (canonical_doi, loc); (None, None) when the XML cannot
            be parsed or carries no DOI
        :raises plugin.PluginException: when the record cannot be retrieved
            from PubMed
        """
        xml_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + canonical_pmid[5:] + "&retmode=xml"

        # fetch the XML record; http_get may give back nothing at all
        # (e.g. when the request times out), which is treated like any
        # other failed retrieval
        response = util.http_get(xml_url)
        if response is None or response.status_code != requests.codes.ok:
            raise plugin.PluginException(plugin.PluginException.HTTP, "unable to retrieve record from PubMed")

        try:
            # lxml wants bytes when the document carries an encoding declaration
            xml = etree.fromstring(response.text.encode("utf-8"))
        except Exception:
            # deliberately broad, but no longer swallows KeyboardInterrupt/SystemExit
            log.error("Error parsing the XML from " + xml_url)
            return None, None

        xp = "/PubmedArticleSet/PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType='doi']"
        els = xml.xpath(xp)

        if not els:
            # we didn't find a DOI
            return None, None

        # FIXME: we assume there is only one DOI in the record - is this really true?
        doi_string = els[0].text
        if not doi_string or not doi_string.strip():
            # the element exists but is empty: same outcome as no DOI at all
            return None, None

        doi = DOIPlugin()
        canonical_doi = doi.canonical_form(doi_string)
        loc = doi.dereference(canonical_doi)

        return canonical_doi, loc
def resolve_doi(doi):
    """
    Resolve a DOI to the URL it redirects to.

    A cheap HEAD request is tried first; if that does not lead anywhere a
    full GET through util.http_get is attempted.

    :param doi: DOI string, appended as-is to "http://dx.doi.org/"
    :return: the final URL when it differs from the dx.doi.org URL,
        otherwise the string "DOI could not be resolved."
    """
    url = "http://dx.doi.org/" + doi

    try:
        # HEAD is the one verb for which requests does NOT follow redirects
        # by default, so without allow_redirects the comparison below could
        # never detect the resolver's redirect
        r = requests.head(url, timeout=config.CONN_TIMEOUT, allow_redirects=True)
    except requests.RequestException:
        # a failed HEAD should not prevent the GET fallback from being tried
        r = None
    if r is not None and r.url != url:
        return r.url

    # only the response object of the returned triple is needed here
    r, _content, _length = util.http_get(url)
    if r is not None and r.url != url:
        return r.url
    return "DOI could not be resolved."
Example #4
0
    def dereference(self, canonical):
        """
        Follow a DOI through the dx.doi.org resolver and return where it lands.

        :param canonical: identifier whose first four characters are dropped
            before it is appended to "http://dx.doi.org/" (presumably the
            canonical "doi:" prefix -- confirm against the caller)
        :return: the URL of the final response
        :raises plugin.PluginException: when no response is obtained or the
            final response does not have an OK status
        """
        resolvable = "http://dx.doi.org/" + canonical[4:]

        # now dereference it and find out the target of the (chain of) 303(s)
        response = util.http_get(resolvable)
        if response is None:
            # http_get may give back nothing at all (e.g. when the request
            # times out); report that through the same exception type rather
            # than failing with an AttributeError below
            raise plugin.PluginException(
                plugin.PluginException.HTTP,
                "Unable to dereference DOI to valid endpoint, got no response",
            )
        if response.status_code != requests.codes.ok:
            raise plugin.PluginException(
                plugin.PluginException.HTTP,
                "Unable to dereference DOI to valid endpoint, got " + str(response.status_code),
            )

        return response.url
 def test_03_http_get_long_timeout(self):
     """http_get against the "/long_timeout" endpoint must give a falsy result."""
     target = self.app_url + "/long_timeout"
     outcome = util.http_get(target)
     assert not outcome
 def test_02_http_get_timeout(self):
     """http_get against the "/timeout" endpoint must still deliver the body "okay"."""
     target = self.app_url + "/timeout"
     reply = util.http_get(target)
     # a truthy response first, then the expected payload
     assert reply
     assert reply.text == "okay"
 def test_01_http_get_normal(self):
     """http_get against the "/normal" endpoint must deliver the body "okay"."""
     target = self.app_url + "/normal"
     reply = util.http_get(target)
     # a truthy response first, then the expected payload
     assert reply
     assert reply.text == "okay"
Example #8
0
    def license_detect(self, record):
        """
        To respond to the provider identifier: http://elife.elifesciences.org

        This should determine the licence conditions of the eLife article and
        attach a license object (note the US spelling) to the record for every
        licensing statement that matches.

        :param record: object providing ``doi_without_prefix`` and
            ``add_license_object``
        :return: tuple ``(self._short_name, self.__version__)``; ``None`` when
            no response is obtained or the XML cannot be parsed
        """

        # List of licensing statements to look for on this publisher's pages.
        # In eLife's case they take the form of {xpath string: meaning object}
        # since we're not scraping HTML, we're using an XML API.
        # meaning['type'] identifies the license (see licenses.py)
        # and meaning['version'] identifies the license version (if available)
        elife_license_mappings = self._license_mappings

        # 1. get DOI from record object
        doi = record.doi_without_prefix  # it MUST NOT HAVE the canonical DOI prefix, "doi:" or "DOI:"
        if not doi:
            return (self._short_name, self.__version__)

        # 2. query elife XML api
        url = 'http://elife.elifesciences.org/elife-source-xml/' + doi
        response = util.http_get(url)
        if response is None:
            # http_get may give back nothing at all (e.g. when the request
            # times out); treat it like an unparseable document
            log.error("No response received from " + url)
            return None

        body = response.text
        source_size = len(body)

        try:
            # Calling .decode() on already-decoded text fails (implicit ASCII
            # encode on Python 2, no such method on Python 3), so only decode
            # when the body really is bytes, then hand lxml UTF-8 bytes, which
            # it accepts with or without an XML encoding declaration.
            if isinstance(body, bytes):
                body = body.decode("utf-8", "ignore")
            xml = etree.fromstring(body.encode("utf-8"))
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # process the XML response
        namespaces = {'xlink': 'http://www.w3.org/1999/xlink'}

        for mapping in elife_license_mappings:
            # each mapping holds a single {xpath: meaning} pair;
            # next(iter(...)) works where keys() is not indexable
            xpath = next(iter(mapping))
            meaning = mapping[xpath]

            if not xml.xpath(xpath, namespaces=namespaces):
                continue

            lic_type = meaning['type']

            # license identified, now use that to construct the license object
            license_obj = deepcopy(LICENSES[lic_type])
            license_obj['open_access'] = oa_policy.oa_for_license(lic_type)
            # set some defaults which have to be there, even if empty
            license_obj.setdefault('version', '')
            license_obj.setdefault('description', '')
            license_obj.setdefault('jurisdiction', '')  # TODO later (or later version of OAG!)

            # Copy over all information about the license from the license
            # statement mapping. In essence, transfer the knowledge of the
            # publisher plugin authors to the license object.
            # Consequence: Values coming from the publisher plugin overwrite
            # values specified in the licenses module.
            license_obj.update(meaning)

            # add provenance information to the license object
            license_obj['provenance'] = {
                'handler': self._short_name,
                'handler_version': self.__version__,
                'date': datetime.now().strftime(config.date_format),
                'source': url,
                "source_size": source_size,
                'agent': config.agent,
                'category': 'xml_api',  # TODO we need to think how the
                    # users get to know what the values here mean.. docs?
                'description': 'License decided by querying the eLife XML API at ' + url
            }

            record.add_license_object(license_obj)

        return (self._short_name, self.__version__)
    def license_detect(self, record):
        """
        Determine the licence of an Elsevier article via the Elsevier XML API.

        The article is looked up by DOI. If the API marks it as open access
        and/or names a Creative Commons user license, a license object with
        provenance information is attached to the record. An open access
        article whose license cannot be identified is recorded as
        'free-to-read'.

        :param record: object providing ``provider_doi`` and
            ``add_license_object``
        :return: tuple ``(self._short_name, self.__version__)``; ``None`` when
            no response is obtained or the XML cannot be parsed
        """
        # 1. get DOI from record object
        doi = record.provider_doi  # it MUST HAVE the canonical DOI prefix, "doi:" or "DOI:"
        if not doi:
            return (self._short_name, self.__version__)

        # 2. query Elsevier XML api
        url = 'http://api.elsevier.com/content/article/' + doi
        response = util.http_get(url)
        if response is None:
            # http_get may give back nothing at all (e.g. when the request
            # times out); treat it like an unparseable document
            log.error("No response received from " + url)
            return None

        response.encoding = 'utf-8'
        content = response.text
        source_size = len(content)
        # isinstance(bytes) instead of type(...) == str: same test on Python 2,
        # and it no longer calls .decode() on text under Python 3
        if isinstance(content, bytes):
            content = content.decode('utf-8', 'replace')

        try:
            # lxml rejects unicode input that carries an XML encoding
            # declaration, so always hand it UTF-8 bytes
            xml = etree.fromstring(content.encode('utf-8'))
        except Exception as e:
            log.error("Error parsing the XML from " + url)
            log.error(e)
            return None  # no point in doing anything else

        # process the XML response
        namespaces = {'elsevierapi': 'http://www.elsevier.com/xml/svapi/article/dtd'}

        # is it open access at all?
        # case insensitive search for the value "true" in the relevant element
        xpath_oa = "//elsevierapi:openaccessArticle//text()[contains(translate(., 'EURT', 'eurt'), 'true')]"
        it_is_oa = len(xml.xpath(xpath_oa, namespaces=namespaces)) > 0

        # now try to get the license too
        lic_type = None
        lic_version = None
        url_to_record = None

        elements = xml.xpath('//elsevierapi:openaccessUserLicense', namespaces=namespaces)
        license_url = elements[0].text if len(elements) > 0 else None

        if license_url:
            urlparts = self.clean_url(license_url).split('/')
            # expected shape: creativecommons.org/<section>/<type>[/<version>...]
            if urlparts[0] == 'creativecommons.org' and len(urlparts) > 2:
                candidate = 'cc-' + urlparts[2]
                # only accept types the licenses module knows about; anything
                # else would fail the LICENSES lookup further down
                if candidate in LICENSES:
                    lic_type = candidate
                    # we know what the license is, i.e. "a success",
                    # so we can use the URL *they* specified
                    url_to_record = license_url
                    if len(urlparts) > 3:
                        # may be empty, in which case no version is asserted below
                        lic_version = urlparts[3]

        if it_is_oa and not lic_type:
            # Elsevier says the article's OA but we could not determine a license at all
            lic_type = 'free-to-read'

        if not lic_type:
            return (self._short_name, self.__version__)

        meaning = {'type': lic_type}
        if lic_version:
            meaning['version'] = lic_version
        if url_to_record:
            meaning['url'] = url_to_record

        # license identified, now use that to construct the license object
        license_obj = deepcopy(LICENSES[lic_type])
        license_obj['open_access'] = oa_policy.oa_for_license(lic_type)
        # set some defaults which have to be there, even if empty
        license_obj.setdefault('version', '')
        license_obj.setdefault('description', '')
        license_obj.setdefault('jurisdiction', '')

        # Copy over all information about the license from the license
        # statement mapping. In essence, transfer the knowledge of the
        # publisher plugin authors to the license object.
        # Consequence: Values coming from the publisher plugin overwrite
        # values specified in the licenses module.
        license_obj.update(meaning)

        # add provenance information to the license object
        license_obj['provenance'] = {
            'handler': self._short_name,
            'handler_version': self.__version__,
            'date': datetime.now().strftime(config.date_format),
            'source': url,
            "source_size": source_size,
            'agent': config.agent,
            'category': 'xml_api',  # TODO we need to think how the
                # users get to know what the values here mean.. docs?
            'description': 'License decided by querying the Elsevier XML API at ' + url
        }

        record.add_license_object(license_obj)

        return (self._short_name, self.__version__)