class Tester(TestCase):
    """Smoke tests for ArticleDownloader: PDF/abstract download, entitlement, search."""

    def setUp(self):
        """Create the downloader, a sample DOI and the temporary files used by the tests.

        Every temporary file has its ``close`` registered with ``addCleanup``
        right after it is opened, so the handle is released after each test and
        also when a later step of ``setUp`` raises (``tearDown`` is not run in
        that case, cleanups are).
        """
        self.downloader = ArticleDownloader(environ.get('ELS_API_KEY'))
        self.doi = '10.1016/j.nantod.2008.10.014'

        # Write-only binary sink handed to the PDF download calls.
        self.pdf_file = TemporaryFile(mode='wb')
        self.addCleanup(self.pdf_file.close)

        # NOTE(review): the files below are opened in binary mode but receive
        # text literals. That works on Python 2; on Python 3 ``write`` raises
        # TypeError. Confirm the target interpreter before changing the mode
        # or the literals.
        self.txt_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.txt_file.close)
        self.txt_file.write('10.1016/j.nantod.2008.10.014')

        self.csv_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.csv_file.close)
        self.csv_file.write('nanomaterial+synthesis,')
        self.csv_file.write('battery+electrode,')

    def test_download(self):
        """Request the sample DOI's PDF through each of the three sources."""
        # Single download test
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'elsevier')
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'crossref')
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'rsc')

    def test_abstract_download(self):
        """Request the sample DOI's abstract from the 'elsevier' source."""
        self.downloader.get_abstract_from_doi(self.doi, 'elsevier')

    def test_entitlement(self):
        """Check that the entitlement call returns a boolean-like value."""
        # Test entitlement - want to check this works, but pass/fail depends
        # on IP addr, so either answer is accepted.
        self.assertIn(
            self.downloader.check_els_entitlement(self.doi), [True, False])

    def test_search(self):
        """Run a DOI search for every query loaded from the CSV fixture."""
        # Search test
        queries = self.downloader.load_queries_from_csv(self.csv_file)
        for query in queries:
            self.downloader.get_dois_from_search(query, rows=1000)

    def tearDown(self):
        """Nothing to do here: the temporary files are closed by the cleanups
        registered in ``setUp``."""
        pass
def setUp(self):
    """Build the fixtures shared by the tests.

    Creates the downloader with the API key read from the ``ELS_API_KEY``
    environment variable, stores a sample DOI, and opens three temporary
    files: an empty write-only binary file, a file containing the sample DOI,
    and a file containing two comma-terminated search queries.
    """
    sample_doi = '10.1016/j.nantod.2008.10.014'
    search_terms = ('nanomaterial+synthesis,', 'battery+electrode,')

    self.downloader = ArticleDownloader(environ.get('ELS_API_KEY'))
    self.doi = sample_doi
    self.pdf_file = TemporaryFile(mode='wb')

    # NOTE(review): binary-mode files receiving text literals -- fine on
    # Python 2, TypeError on Python 3. Confirm the target interpreter.
    self.txt_file = TemporaryFile(mode='rb+')
    self.txt_file.write(sample_doi)

    self.csv_file = TemporaryFile(mode='rb+')
    for term in search_terms:
        self.csv_file.write(term)
class Tester(TestCase):
    """Smoke tests for ArticleDownloader: PDF download, abstract download, search."""

    def setUp(self):
        """Create the downloader, a sample DOI and the temporary files used by the tests.

        Every temporary file has its ``close`` registered with ``addCleanup``
        right after it is opened, so the handle is released after each test and
        also when a later step of ``setUp`` raises (``tearDown`` is not run in
        that case, cleanups are).
        """
        self.downloader = ArticleDownloader(environ.get('ELS_API_KEY'))
        self.doi = '10.1016/j.nantod.2008.10.014'

        # Write-only binary sink handed to the PDF download call.
        self.pdf_file = TemporaryFile(mode='wb')
        self.addCleanup(self.pdf_file.close)

        # NOTE(review): the files below are opened in binary mode but receive
        # text literals. That works on Python 2; on Python 3 ``write`` raises
        # TypeError. Confirm the target interpreter before changing the mode
        # or the literals.
        self.txt_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.txt_file.close)
        self.txt_file.write('10.1016/j.nantod.2008.10.014')

        self.csv_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.csv_file.close)
        self.csv_file.write('nanomaterial+synthesis,')
        self.csv_file.write('battery+electrode,')

    def test_download(self):
        """Request the sample DOI's PDF from the 'elsevier' source."""
        # Single download test
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'elsevier')

    def test_abstract_download(self):
        """Request the sample DOI's abstract from the 'elsevier' source."""
        self.downloader.get_abstract_from_doi(self.doi, 'elsevier')

    def test_search(self):
        """Run a DOI search for every query loaded from the CSV fixture."""
        # Search test
        queries = self.downloader.load_queries_from_csv(self.csv_file)
        for query in queries:
            self.downloader.get_dois_from_search(query, rows=10)

    def tearDown(self):
        """Nothing to do here: the temporary files are closed by the cleanups
        registered in ``setUp``."""
        pass
class Tester(TestCase):
    """Smoke tests for ArticleDownloader: PDF download, abstract download, search."""

    def setUp(self):
        """Create the downloader, a sample DOI and the temporary files used by the tests.

        Every temporary file has its ``close`` registered with ``addCleanup``
        right after it is opened, so the handle is released after each test and
        also when a later step of ``setUp`` raises (``tearDown`` is not run in
        that case, cleanups are).
        """
        self.downloader = ArticleDownloader(environ.get('ELS_API_KEY'))
        self.doi = '10.1016/j.nantod.2008.10.014'

        # Write-only binary sink handed to the PDF download call.
        self.pdf_file = TemporaryFile(mode='wb')
        self.addCleanup(self.pdf_file.close)

        # NOTE(review): the files below are opened in binary mode but receive
        # text literals. That works on Python 2; on Python 3 ``write`` raises
        # TypeError. Confirm the target interpreter before changing the mode
        # or the literals.
        self.txt_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.txt_file.close)
        self.txt_file.write('10.1016/j.nantod.2008.10.014')

        self.csv_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.csv_file.close)
        self.csv_file.write('nanomaterial+synthesis,')
        self.csv_file.write('battery+electrode,')

    def test_download(self):
        """Request the sample DOI's PDF from the 'elsevier' source."""
        # Single download test
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'elsevier')

    def test_abstract_download(self):
        """Request the sample DOI's abstract from the 'elsevier' source."""
        self.downloader.get_abstract_from_doi(self.doi, 'elsevier')

    def test_search(self):
        """Run a DOI search for every query loaded from the CSV fixture."""
        # Search test
        queries = self.downloader.load_queries_from_csv(self.csv_file)
        for query in queries:
            self.downloader.get_dois_from_search(query, rows=10)

    def tearDown(self):
        """Nothing to do here: the temporary files are closed by the cleanups
        registered in ``setUp``."""
        pass
def setUp(self):
    """Build the fixtures shared by the tests.

    Creates the downloader with the placeholder key ``'NO_API_KEY'``, stores a
    sample DOI, and opens three temporary files: an empty write-only binary
    file, a file containing the sample DOI, and a file containing two
    comma-terminated search queries.
    """
    sample_doi = '10.1016/j.nantod.2008.10.014'
    search_terms = ('nanomaterial+synthesis,', 'battery+electrode,')

    self.downloader = ArticleDownloader('NO_API_KEY')
    self.doi = sample_doi
    self.pdf_file = TemporaryFile(mode='wb')

    # NOTE(review): binary-mode files receiving text literals -- fine on
    # Python 2, TypeError on Python 3. Confirm the target interpreter.
    self.txt_file = TemporaryFile(mode='rb+')
    self.txt_file.write(sample_doi)

    self.csv_file = TemporaryFile(mode='rb+')
    for term in search_terms:
        self.csv_file.write(term)
class Tester(TestCase):
    """Smoke tests for ArticleDownloader created with a placeholder API key."""

    def setUp(self):
        """Create the downloader, a sample DOI and the temporary files used by the tests.

        Every temporary file has its ``close`` registered with ``addCleanup``
        right after it is opened, so the handle is released after each test and
        also when a later step of ``setUp`` raises (``tearDown`` is not run in
        that case, cleanups are).
        """
        self.downloader = ArticleDownloader('NO_API_KEY')
        self.doi = '10.1016/j.nantod.2008.10.014'

        # Write-only binary sink handed to the PDF download calls.
        self.pdf_file = TemporaryFile(mode='wb')
        self.addCleanup(self.pdf_file.close)

        # NOTE(review): the files below are opened in binary mode but receive
        # text literals. That works on Python 2; on Python 3 ``write`` raises
        # TypeError. Confirm the target interpreter before changing the mode
        # or the literals.
        self.txt_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.txt_file.close)
        self.txt_file.write('10.1016/j.nantod.2008.10.014')

        self.csv_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.csv_file.close)
        self.csv_file.write('nanomaterial+synthesis,')
        self.csv_file.write('battery+electrode,')

    def test_download(self):
        """Request the sample DOI's PDF through the 'elsevier' and 'crossref' sources."""
        # Single download test
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'elsevier')
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'crossref')

    def test_entitlement(self):
        """Expect the entitlement check to be falsy for the placeholder key."""
        # Test entitlement
        self.assertFalse(self.downloader.check_els_entitlement(self.doi))

    def test_search(self):
        """Run a DOI search for every query loaded from the CSV fixture."""
        # Search test
        queries = self.downloader.load_queries_from_csv(self.csv_file)
        for query in queries:
            self.downloader.get_dois_from_search(query, rows=1200)

    def tearDown(self):
        """Nothing to do here: the temporary files are closed by the cleanups
        registered in ``setUp``."""
        pass
class Tester(TestCase):
    """Smoke tests for ArticleDownloader created with a placeholder API key."""

    def setUp(self):
        """Create the downloader, a sample DOI and the temporary files used by the tests.

        Every temporary file has its ``close`` registered with ``addCleanup``
        right after it is opened, so the handle is released after each test and
        also when a later step of ``setUp`` raises (``tearDown`` is not run in
        that case, cleanups are).
        """
        self.downloader = ArticleDownloader('NO_API_KEY')
        self.doi = '10.1016/j.nantod.2008.10.014'

        # Write-only binary sink handed to the PDF download calls.
        self.pdf_file = TemporaryFile(mode='wb')
        self.addCleanup(self.pdf_file.close)

        # NOTE(review): the files below are opened in binary mode but receive
        # text literals. That works on Python 2; on Python 3 ``write`` raises
        # TypeError. Confirm the target interpreter before changing the mode
        # or the literals.
        self.txt_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.txt_file.close)
        self.txt_file.write('10.1016/j.nantod.2008.10.014')

        self.csv_file = TemporaryFile(mode='rb+')
        self.addCleanup(self.csv_file.close)
        self.csv_file.write('nanomaterial+synthesis,')
        self.csv_file.write('battery+electrode,')

    def test_download(self):
        """Request the sample DOI's PDF through the 'elsevier' and 'crossref' sources."""
        # Single download test
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'elsevier')
        self.downloader.get_pdf_from_doi(self.doi, self.pdf_file, 'crossref')

    def test_entitlement(self):
        """Expect the entitlement check to be falsy for the placeholder key."""
        # Test entitlement
        self.assertFalse(self.downloader.check_els_entitlement(self.doi))

    def test_search(self):
        """Run a DOI search for every query loaded from the CSV fixture."""
        # Search test
        queries = self.downloader.load_queries_from_csv(self.csv_file)
        for query in queries:
            self.downloader.get_dois_from_search(query, rows=1200)

    def tearDown(self):
        """Nothing to do here: the temporary files are closed by the cleanups
        registered in ``setUp``."""
        pass
class DownloadManager:
    """Collects DOIs from Materials Project, CrossRef keyword or CrossRef ISSN queries.

    Found DOIs are accumulated in ``self.dl_dois``.

    NOTE(review): ``self.__logger`` is used by ``get_dois`` but is not assigned
    anywhere in the visible part of this class; it has to be set up before
    ``get_dois`` is called.
    """

    # Shared clients, created once when the class body is executed.
    ad = ArticleDownloader(environ.get('ELS_API_KEY'),
                           environ.get('CRF_API_KEY'))
    connection = MongoClient()

    # Class-level defaults, kept so attribute access on the class itself keeps
    # working. __init__ rebinds the mutable ones per instance so that separate
    # managers no longer append into the same list/dict.
    dl_doi_pdf_map = {}
    doi_fails = []
    dl_dois = []
    rows_per_query = 0

    def __init__(self, db):
        """Store the database handle and give this instance its own containers.

        Args:
            db: database handle/name kept on the instance as ``self.db``.
        """
        self.db = db
        self.dl_doi_pdf_map = {}
        self.doi_fails = []
        self.dl_dois = []

    def set_dois_per_query(self, num_docs):
        """Set how many rows each CrossRef query requests.

        Args:
            num_docs: row count; converted with ``int()``.
        """
        self.rows_per_query = int(num_docs)

    def get_dois(self, queries, mode, wait_time=0):
        """Run ``queries`` against the source chosen by ``mode`` and collect DOIs.

        Args:
            queries: iterable of queries; their form depends on ``mode``.
            mode: ``'mp'`` (Materials Project), ``'cr'`` (CrossRef keyword
                search) or ``'issn'`` (CrossRef journal ISSN). Any other value
                does nothing.
            wait_time: seconds to sleep after each ``'cr'``/``'issn'`` query.

        A failing query is logged as a warning and skipped; it does not stop
        the remaining queries.
        """
        if mode == 'mp':
            self.__logger.info('Searching with MP queries')
            self._collect_mp_dois(queries)
        elif mode == 'cr':
            self.__logger.info('Searching with CR queries')
            self._collect_search_dois(
                queries,
                lambda query: self.ad.get_dois_from_search(
                    query, rows=self.rows_per_query),
                'FAILURE: Failed to get DOIs from CR: ',
                wait_time)
        elif mode == 'issn':
            self.__logger.info('Searching with ISSN queries')
            self._collect_search_dois(
                queries,
                lambda query: self.ad.get_dois_from_journal_issn(
                    query, rows=self.rows_per_query, pub_after=1900),
                'FAILURE: Failed to get DOIs from CR by ISSN: ',
                wait_time)

    def _collect_mp_dois(self, queries):
        """Add the not-yet-known reference DOIs of each query's MP entries."""
        mpr = MPRester(environ.get('MAPI_KEY'),
                       endpoint="https://www.materialsproject.org/rest")
        # Set mirror of self.dl_dois for O(1) duplicate checks.
        known_dois = set(self.dl_dois)
        for query in queries:
            try:
                # Task ids are gathered per query. Previously one list grew
                # across queries, so every query re-fetched all earlier ids
                # and an id that failed once failed again for each later query.
                task_ids = []
                for entry in mpr.get_entries(query):
                    task_ids.extend(entry.data['task_ids'])
                for task_id in task_ids:
                    mpid = mpr.get_materials_id_from_task_id(
                        task_id)['materials_id']
                    bibtex = mpr.get_materials_id_references(mpid)
                    for item in bibtexparser.loads(bibtex).entries:
                        if 'doi' in item and item['doi'] not in known_dois:
                            known_dois.add(item['doi'])
                            self.dl_dois.append(item['doi'])
            except Exception as e:
                # Not a bare ``except``: KeyboardInterrupt/SystemExit propagate.
                self.__logger.warning(
                    'FAILURE: Failed to get DOIs from MP:' + str(query))
                self.__logger.warning('EXCEPTION: ' + str(e))

    def _collect_search_dois(self, queries, fetch, failure_prefix, wait_time):
        """Call ``fetch(query)`` per query, append its DOIs, pausing between queries."""
        for query in queries:
            found = []
            try:
                found = fetch(query)
            except Exception as e:
                self.__logger.warning(failure_prefix + str(query))
                self.__logger.warning('EXCEPTION: ' + str(e))
            sleep(wait_time)
            self.dl_dois.extend(found)
for sec in c.iter(): if sec.tag.find('section-title') != -1: #print(sec.text) rawtext += sec.text + '\n' if sec.tag.find('para') != -1: #print(sec.text) rawtext += sec.text + '\n' #print(rawtext) return rawtext restclient.add_resource(resource_name='thesis') #text = findText('./elsevier/A quick method for the simultaneous determination of ascorbic acid and sorbic acid in fruit juices by capillary zone el.xml') #sys.exit(0) try: downloader = ArticleDownloader( els_api_key='e88e30b8118b3ed42ca752c0d6b59686') #https://api.elsevier.com/content/search/sciencedirect?query=nutrition&APIKey=e88e30b8118b3ed42ca752c0d6b59686 #dois = downloader.get_dois_from_journal_issn('1476-4686', rows=500, pub_after=2000) filetype = 'xml' #78 is for elsevier records = downloader.get_dict_from_search( 'ascorbic acid+extraction+fruit&filter=member:78', 3000) for i, record in enumerate(records): print(i) cur_title = re.sub('[\[\]\'\.\/]', '', str(record['title'])) replaced_doi = re.sub('[\[\]\'\.\/()]', '', str(record['doi'])) print(replaced_doi) cur_filename = './elsevier/' + replaced_doi + '.' + filetype try: my_file = open(cur_filename, 'wb') # Need to use 'wb' on Windows
import bibtexparser import requests import os from articledownloader.articledownloader import ArticleDownloader import time from sys import platform downloader = ArticleDownloader(els_api_key='11acc1dbb49e1a44d49d46d48469a2f7') if platform == "linux" or platform == "linux2": cpdf = './cpdf-binaries/Linux-Intel-64bit/cpdf' elif platform == "darwin": cpdf = './cpdf-binaries/OSX-Intel/cpdf' elif platform == "win32": cpdf = './cpdf-binaries/Windows32bit/cpdf.exe' aux_lines = [] with open('publist_biobib.aux') as aux_file: aux_lines = aux_file.readlines() with open('bib_publications.bib') as bibtex_file: bib_database = bibtexparser.load(bibtex_file) article_dir = 'Research_Articles' os.makedirs(article_dir, exist_ok=True) for l in bib_database.entries: print('Checking %s' % l['ID']) if 'doi' in l and 'keywords' in l and 'recent' in l['keywords']: name = '' for line in aux_lines: