# Requires module-level imports of `json` and `requests`, plus the local
# `scrapers` module used for publisher landing pages.
def get_pdf_from_doi(self, doi, writefile, mode):
    '''
    Downloads and writes a PDF article to a file, given a DOI and operating mode

    :param doi: DOI string for the article we want to download
    :type doi: str
    :param writefile: file object to write to
    :type writefile: file
    :param mode: choose from {'crossref' | 'elsevier' | 'rsc' | 'springer' |
        'ecs' | 'nature' | 'acs'}, depending on how we wish to access the file
    :type mode: str
    :returns: True on successful write, False otherwise
    :rtype: bool
    '''
    if mode == 'crossref':
        # Resolve the DOI through the Crossref REST API, then follow the
        # first full-text link it reports.
        base_url = 'http://api.crossref.org/works/'
        api_url = base_url + doi
        headers = {'Accept': 'application/json'}
        try:
            response = json.loads(
                requests.get(api_url, headers=headers,
                             timeout=self.timeout_sec).text)
            pdf_url = response['message']['link'][0]['URL']
            app_type = str(response['message']['link'][0]['content-type'])
            if app_type in ['application/pdf', 'unspecified']:
                headers['Accept'] = 'application/pdf'
                r = requests.get(pdf_url, stream=True, headers=headers,
                                 timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
        except Exception:
            return False
        return False
    if mode == 'elsevier':
        try:
            pdf_url = ('http://api.elsevier.com/content/article/doi:'
                       + doi + '?view=FULL')
            headers = {
                'X-ELS-APIKEY': self.els_api_key,
                'Accept': 'application/pdf'
            }
            r = requests.get(pdf_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:  # API download limit exceeded
            return False
        return False
    if mode == 'rsc':
        # Resolve the DOI, scrape the landing page for a download link,
        # then stream the PDF from that link.
        scraper = scrapers.RSC()
        scrape_url = 'http://dx.doi.org/' + doi
        download_url = None
        r = requests.get(scrape_url, timeout=self.timeout_sec)
        if r.status_code == 200:
            scraper.feed(r.text)  # HTMLParser.feed() expects str, not bytes
            if scraper.download_link is not None:
                download_url = scraper.download_link
        if download_url is not None:
            headers = {'Accept': 'application/pdf'}
            r = requests.get(download_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except Exception:
                    return False
        return False
    if mode == 'ecs':
        # Same scrape-then-download flow as 'rsc', using the ECS scraper.
        scraper = scrapers.ECS()
        scrape_url = 'http://dx.doi.org/' + doi
        download_url = None
        r = requests.get(scrape_url, timeout=self.timeout_sec)
        if r.status_code == 200:
            scraper.feed(r.text)
            if scraper.download_link is not None:
                download_url = scraper.download_link
        if download_url is not None:
            headers = {'Accept': 'application/pdf'}
            r = requests.get(download_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except Exception:
                    return False
        return False
    if mode == 'nature':
        # Same scrape-then-download flow as 'rsc', using the Nature scraper.
        scraper = scrapers.Nature()
        scrape_url = 'http://dx.doi.org/' + doi
        download_url = None
        r = requests.get(scrape_url, timeout=self.timeout_sec)
        if r.status_code == 200:
            scraper.feed(r.text)
            if scraper.download_link is not None:
                download_url = scraper.download_link
        if download_url is not None:
            headers = {'Accept': 'application/pdf'}
            r = requests.get(download_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except Exception:
                    return False
        return False
    if mode == 'acs':
        base_url = 'http://pubs.acs.org/doi/pdf/'
        api_url = base_url + doi
        try:
            headers = {
                'Accept': 'application/pdf',
                'User-agent': 'Mozilla/5.0'
            }
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    if mode == 'springer':
        base_url = 'http://link.springer.com/content/pdf/'
        api_url = base_url + doi
        try:
            headers = {
                'Accept': 'application/pdf',
                'User-agent': 'Mozilla/5.0'
            }
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    return False
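# Every mode block above repeats the same stream-to-file loop. As a
# suggestion only: `_stream_to_file` below is a hypothetical helper (not part
# of the original class) that each block could call in place of its inline
# loop, assuming `self.timeout_sec` is set as in the methods above.
def _stream_to_file(self, url, writefile, headers):
    '''Stream `url` into `writefile` in 2 KB chunks; True on HTTP 200.'''
    try:
        r = requests.get(url, stream=True, headers=headers,
                         timeout=self.timeout_sec)
        if r.status_code == 200:
            for chunk in r.iter_content(2048):
                writefile.write(chunk)
            return True
    except Exception:
        return False
    return False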
def get_html_from_doi(self, doi, writefile, mode):
    '''
    Downloads and writes an HTML article to a file, given a DOI and operating mode

    :param doi: DOI string for the article we want to download
    :type doi: str
    :param writefile: file object to write to
    :type writefile: file
    :param mode: choose from {'elsevier' | 'springer' | 'acs' | 'ecs' | 'rsc' |
        'nature' | 'wiley' | 'aaas' | 'emerald'}, depending on how we wish to
        access the file
    :type mode: str
    :returns: True on successful write, False otherwise
    :rtype: bool
    '''
    if mode == 'elsevier':
        if self.check_els_entitlement(doi):
            try:
                html_url = ('http://api.elsevier.com/content/article/doi:'
                            + doi + '?view=FULL')
                headers = {
                    'X-ELS-APIKEY': self.els_api_key,
                    'Accept': 'text/html'
                }
                r = requests.get(html_url, stream=True, headers=headers,
                                 timeout=self.timeout_sec)
                if r.status_code == 200:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
            except Exception:  # API download limit exceeded
                return False
        return False
    if mode == 'springer':
        base_url = 'http://link.springer.com/'
        api_url = base_url + doi + '.html'
        try:
            headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    if mode == 'wiley':
        base_url = 'http://onlinelibrary.wiley.com/doi/'
        api_url = base_url + doi + '/full'
        try:
            headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    if mode == 'acs':
        base_url = 'http://pubs.acs.org/doi/full/'
        api_url = base_url + doi
        try:
            headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    if mode == 'emerald':
        base_url = 'http://www.emeraldinsight.com/doi/full/'
        api_url = base_url + doi
        try:
            headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
            r = requests.get(api_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
        except Exception:
            return False
        return False
    if mode == 'rsc':
        scraper = scrapers.RSC()
        scrape_url = 'http://dx.doi.org/' + doi
        download_url = None
        r = requests.get(scrape_url, timeout=self.timeout_sec)
        if r.status_code == 200:
            scraper.feed(r.text)  # HTMLParser.feed() expects str, not bytes
            if scraper.download_link is not None:
                # Rewrite the scraped PDF link into its HTML counterpart
                download_url = scraper.download_link.replace(
                    'articlepdf', 'articlehtml')
        if download_url is not None:
            headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
            r = requests.get(download_url, stream=True, headers=headers,
                             timeout=self.timeout_sec)
            if r.status_code == 200:
                try:
                    for chunk in r.iter_content(2048):
                        writefile.write(chunk)
                    return True
                except Exception:
                    return False
        return False
    if mode == 'nature':
        download_url = 'http://dx.doi.org/' + doi
        headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
        r = requests.get(download_url, stream=True, headers=headers,
                         timeout=self.timeout_sec)
        if r.status_code == 200:
            try:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            except Exception:
                return False
        return False
    if mode == 'aaas':
        headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
        article_url = 'http://dx.doi.org/' + doi
        resp = requests.get(article_url, headers=headers,
                            timeout=self.timeout_sec)
        # Capture the fulltext URL from the DOI redirect
        download_url = resp.url + doi + '.full'
        r = requests.get(download_url, stream=True, headers=headers,
                         timeout=self.timeout_sec)
        if r.status_code == 200:
            try:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            except Exception:
                return False
        return False
    if mode == 'ecs':
        headers = {'Accept': 'text/html', 'User-agent': 'Mozilla/5.0'}
        article_url = 'http://dx.doi.org/' + doi
        resp = requests.get(article_url, headers=headers,
                            timeout=self.timeout_sec)
        # Capture the fulltext URL from the DOI redirect
        download_url = resp.url + doi + '.full'
        r = requests.get(download_url, stream=True, headers=headers,
                         timeout=self.timeout_sec)
        if r.status_code == 200:
            try:
                for chunk in r.iter_content(2048):
                    writefile.write(chunk)
                return True
            except Exception:
                return False
        return False
    return False
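# Minimal usage sketch. The class name, DOI, and filenames below are
# hypothetical placeholders; an instance needs `els_api_key` and
# `timeout_sec` configured as the methods above assume. Files are opened in
# binary mode because iter_content() yields bytes.
#
#     fetcher = ArticleDownloader()  # hypothetical constructor
#     with open('article.pdf', 'wb') as f:
#         fetcher.get_pdf_from_doi('10.1000/example-doi', f, mode='crossref')
#     with open('article.html', 'wb') as f:
#         fetcher.get_html_from_doi('10.1000/example-doi', f, mode='springer')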