def test_download_from_dict(self):
    """Downloading ``self.paper_dict`` must create the expected PDF file."""
    expected_name = '1605.08386v1.The_Paper_Title.pdf'
    with tempfile.TemporaryDirectory() as scratch_dir:
        arxiv.download(self.paper_dict, dirpath=scratch_dir)
        # Check while the temporary directory still exists.
        expected_path = os.path.join(scratch_dir, expected_name)
        self.assertTrue(os.path.exists(expected_path))
def download(abstract_url: str, download_dir: str) -> None:
    """Fetch the paper for ``abstract_url`` and save it under ``download_dir``.

    The paper record is obtained from ``get_paper``.  The record's ``title``
    entry is handed to ``arxiv.download`` as the slug for the saved file.
    Progress is reported on stdout.
    """
    def title_of(record):
        # Slug callback: use the record's title as-is.
        return record.get("title")

    paper = get_paper(abstract_url)
    print(f"saving '{paper.get('title')}'")
    arxiv.download(paper, dirpath=download_dir, slugify=title_of)
    print(f"saved to {download_dir}")
def test_download_with_custom_slugify_from_dict(self):
    """With ``custom_slugify`` the dict download is saved as '1605.08386v1.pdf'."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        arxiv.download(
            self.paper_dict,
            slugify=custom_slugify,
            dirpath=scratch_dir,
        )
        target = os.path.join(scratch_dir, '1605.08386v1.pdf')
        self.assertTrue(os.path.exists(target))
def test_download_with_custom_slugify_from_dict(self):
    """With ``custom_slugify`` the file in ``self.temp_dir`` is '1605.08386v1.pdf'."""
    arxiv.download(
        self.paper_dict,
        slugify=custom_slugify,
        dirpath=self.temp_dir,
    )
    target = os.path.join(self.temp_dir, '1605.08386v1.pdf')
    self.assertTrue(os.path.exists(target))
def _download_papers(self, folder_path): """ Method for downloading retrieved papers. """ for paper in self.fav_papers: arxiv.download(paper, dirpath=folder_path, slugify=self._custom_slugify)
def get_information_arxiv(q_key, f_max_results):
    """Query arXiv and collect each hit's PDF in a folder named after the query.

    Args:
        q_key: Search query.  Spaces are replaced by underscores to build the
            name of the target folder, which is created if missing.
        f_max_results: Value passed to ``arxiv.query`` as ``max_results``.

    Returns:
        list: The ``pdf_url`` of every result, in result order, including
        results whose download failed.
    """
    keys = arxiv.query(
        search_query=q_key,
        max_results=f_max_results,
    )
    target_dir = q_key.replace(" ", "_")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    liste_link = []
    for entry in keys:
        liste_link.append(entry["pdf_url"])
        pdf_name = entry["title"] + ".pdf"
        try:
            arxiv.download(entry)
            shutil.move(pdf_name, target_dir)
        except Exception as exc:
            # Best effort: report the failure and carry on with the next hit.
            print("could not fetch '{}': {}".format(entry["title"], exc))
        finally:
            # After a successful move nothing is left in the working directory;
            # only a failed download/move can leave a stray file to clean up.
            if os.path.exists(pdf_name):
                os.remove(pdf_name)
    return liste_link
def test_download_from_query(self):
    """Downloading ``self.paper_query`` must create the expected PDF in ``self.temp_dir``."""
    arxiv.download(self.paper_query, dirpath=self.temp_dir)
    saved_pdf = os.path.join(
        self.temp_dir,
        '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf',
    )
    self.assertTrue(os.path.exists(saved_pdf))
def test_download_tarfile_from_dict(self):
    """With ``prefer_source_tarfile=True`` the dict download is saved as a ``.tar.gz``."""
    arxiv.download(
        self.paper_dict,
        dirpath=self.temp_dir,
        prefer_source_tarfile=True,
    )
    saved_tarball = os.path.join(
        self.temp_dir, '1605.08386v1.The_Paper_Title.tar.gz')
    self.assertTrue(os.path.exists(saved_tarball))
def test_download_from_dict(self):
    """Downloading ``self.paper_dict`` must create the expected PDF in ``self.temp_dir``."""
    arxiv.download(self.paper_dict, dirpath=self.temp_dir)
    saved_pdf = os.path.join(
        self.temp_dir, '1605.08386v1.The_Paper_Title.pdf')
    self.assertTrue(os.path.exists(saved_pdf))
def test_download_tarfile_from_query(self):
    """With ``prefer_source_tarfile=True`` the query download is saved as a ``.tar.gz``."""
    arxiv.download(
        self.paper_query,
        dirpath=self.temp_dir,
        prefer_source_tarfile=True,
    )
    saved_tarball = os.path.join(
        self.temp_dir,
        '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.tar.gz',
    )
    self.assertTrue(os.path.exists(saved_tarball))
def test_download_from_query(self):
    """Downloading ``self.paper_query`` with ``max_results=1`` must leave the expected PDF.

    The file is looked up under ``self.temp_dir``, which is passed as
    ``save_path``.
    """
    arxiv.download(self.paper_query, max_results=1, save_path=self.temp_dir)
    saved_pdf = os.path.join(
        self.temp_dir,
        'The_Multi_Agent_Reinforcement_Learning_in_MalmÖ_MARLÖ_Competition.pdf',
    )
    self.assertTrue(os.path.exists(saved_pdf))
def test_download_from_query(self):
    """Downloading ``self.paper_query`` into a temporary directory creates the expected PDF."""
    expected_name = '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf'
    with tempfile.TemporaryDirectory() as scratch_dir:
        arxiv.download(self.paper_query, dirpath=scratch_dir)
        # Check while the temporary directory still exists.
        expected_path = os.path.join(scratch_dir, expected_name)
        self.assertTrue(os.path.exists(expected_path))
def main(argv):
    """Search arXiv from the command line and download the papers the user picks.

    Options (short or long form):
        -q / --query    search query (required)
        -l / --limit    maximum number of results to list (default 10)
        -o / --output   existing directory to download into (default ./input)
        -h / --help     print the usage line and exit

    Args:
        argv: Argument list without the program name (e.g. ``sys.argv[1:]``).

    The process exits with status 2 on invalid arguments, a missing query or
    an output path that is not a directory.  On success the working directory
    is changed to the output directory before downloading.
    """
    usage = ('Use syntax: python arxiv-downloader.py '
             '-q "network node centrality" -l 50 -o ./input')

    try:
        # "o:" (with colon) so that -o actually consumes its directory argument.
        opts, _unused = getopt.getopt(
            argv, "hq:l:o:", ["help", "query=", "limit=", "output="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Init defaults
    limit = 10
    query = False
    outputDirectory = "./input"

    # Parse arguments; getopt reports options with their leading dashes.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit(0)
        elif opt in ("-q", "--query"):
            query = arg
        elif opt in ("-l", "--limit"):
            try:
                limit = int(arg)
            except ValueError:
                print(usage)
                sys.exit(2)
        elif opt in ("-o", "--output"):
            outputDirectory = arg

    if not query:
        print(usage)
        sys.exit(2)

    if not os.path.isdir(outputDirectory):
        print("Given output directory is not a directory.")
        sys.exit(2)
    os.chdir(outputDirectory)

    res = arxiv.query(query, prune=True, start=0, max_results=limit)

    # Display titles; only entries that carry a title are selectable.
    results = [elem for elem in res if "title" in elem]
    for number, elem in enumerate(results, 1):
        print(str(number) + ". " + elem["title"] + "\n")

    # Read the selection as plain text instead of evaluating it
    # (Python 2's input() would eval whatever the user types).
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input      # Python 3
    answer = read_line(
        "Enter the numbers of the papers you want to download separated by commas: \n")

    for token in answer.split(","):
        token = token.strip()
        if not token:
            continue
        try:
            number = int(token)
        except ValueError:
            print("Ignoring invalid selection: " + token)
            continue
        if not 1 <= number <= len(results):
            print("Ignoring out-of-range selection: " + token)
            continue
        # The menu is 1-based, the list is 0-based.
        arxiv.download(results[number - 1])
def download_pdf():
    """Download papers listed in meta_data folder.

    Reads the ``id``, ``pdf_url`` and ``title`` columns of
    ``meta_data/papers.csv``, downloads each paper with ``arxiv.download`` and
    moves the resulting ``<title>.pdf`` to ``papers/<id>.pdf``.  A failure for
    one paper is printed and the loop continues with the next one.  A progress
    counter is printed every 100 rows.
    """
    # Read everything as text: numeric-looking ids such as "1605.08386" would
    # otherwise be parsed as floats, which breaks the file-name concatenation.
    df = pd.read_csv('meta_data/papers.csv',
                     usecols=['id', 'pdf_url', 'title'],
                     dtype=str)
    rows = zip(df['id'], df['title'], df['pdf_url'])
    for i, (paper_id, title, pdf_url) in enumerate(rows):
        if i % 100 == 0:
            print(i)
        try:
            arxiv.download({'pdf_url': pdf_url, 'title': title})
            move(title + '.pdf', str(Path('papers', paper_id + '.pdf')))
        except Exception as e:
            print(e)
        # Pause between requests.
        time.sleep(2.5)
def main():
    """Command-line entry point: look up one arXiv id and download that paper.

    ``--id`` is required.  ``--dir``, ``--prepend`` and ``--slugify`` are
    passed positionally to ``arxiv.download`` after the paper object.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--dir', type=str, default='.', help='directory name')
    cli.add_argument('--id', type=str, required=True, help='arXiv id')
    for flag, description in (('--prepend', 'prepend paper id'),
                              ('--slugify', 'slugify file name')):
        cli.add_argument(flag, action='store_true', help=description)
    options = cli.parse_args()

    matches = arxiv.query(id_list=[options.id])
    arxiv.download(matches[0], options.dir, options.prepend, options.slugify)
def pickle2pdf(target_category='cs.LG'):
    """Download PDFs for the papers stored in the yearly pickles (2012-2019).

    For every paper the category folder is chosen from its ``categories``
    entry: ``LG_CL`` when both ``cs.LG`` and ``cs.CL`` are present, ``LG`` or
    ``CL`` when only one is.  For any other paper the previously selected
    folder is kept, as in the original code.  Successful ids are appended to
    ``suc_ids.txt`` and failed ones to ``faild_ids.txt``.

    Args:
        target_category: Not used by this function; kept so existing callers
            keep working.
    """
    # The slug callback reads the module-level ``c`` at call time, so the
    # global must be kept in sync with the paper currently being processed.
    global c
    c = ''

    def custom_slugify(obj):
        name = obj.get('id').split('/')[-1]
        return 'data/arxiv/{}/pdf/'.format(c) + name

    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2020)
    ]
    print('filelist ready')

    num_suc = 0
    num_fail = 0
    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))
        for paper in papers:
            arxiv_id = paper['arxivid']
            category = paper['categories']
            if 'cs.LG' in category:
                c = 'LG_CL' if 'cs.CL' in category else 'LG'
            elif 'cs.CL' in category:
                c = 'CL'

            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
                print('download {} {} succeed.'.format(arxiv_id, c))
                with open('suc_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_suc += 1
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C still
                # stops a long-running batch.
                print('----------download {} {} failed'.format(arxiv_id, c))
                with open('faild_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_fail += 1
    print('num_suc: {} , num_fail: {}'.format(num_suc, num_fail))
    return
def paper_download(self):
    """Download ``self.paper`` once into every author directory.

    The directories come from ``self.make_author_directory()``; file names are
    produced by ``self.get_paper_name``.  A failed download is logged as a
    warning.  Every attempt, successful or not, is followed by a 3 second
    pause.
    """
    self.make_download_directory()
    author_dir_paths = self.make_author_directory()
    print("author_dir_paths", author_dir_paths)

    for target_dir in author_dir_paths:
        try:
            arxiv.download(obj=self.paper,
                           dirpath=f'{target_dir}',
                           slugify=self.get_paper_name)
            print("author_dir_path:", target_dir)
        except Exception as e:
            logging.warning(f"download_error {e}")
        time.sleep(3)
def download_from_arxiv(title, my_api_key, my_cse_id, dirname='./'):
    """Download a paper by title, from arXiv if possible, otherwise via Google.

    Args:
        title (str): Full title of the paper
        my_api_key: Passed through to ``download_from_google``
        my_cse_id: Passed through to ``download_from_google``
        dirname (str): Output directory of the pdf after downloaded

    Returns:
        The value returned by ``arxiv.download`` when the paper is fetched
        from arXiv; ``None`` when the file already exists or the Google
        fallback is used.
    """
    # Quote the title for an exact search, then strip non-ASCII characters.
    title = remove_non_ascii('"' + title.replace('-', ' ') + '"')
    results = arxiv.query('ti:' + title)
    logging.debug(results)

    outfile = get_filename(dirname, title)
    if os.path.isfile(outfile):
        logging.info(
            "Paper has already been downloaded previously. Will skip downloading this file (%s)",
            title)
        return None

    if paper_available_on_arxiv(results):
        logging.info("Downloading from arxiv: " + title)
        # slugify=True is forwarded unchanged to arxiv.download.
        return arxiv.download(results[0], dirname=dirname, slugify=True)

    google_query = title + ' filetype:PDF'
    logging.info("Paper is not on arxiv, downloading from google: " + google_query)
    download_from_google(google_query, my_api_key, my_cse_id, outfile)
    return None
def create_clinks_set(item, dirpath):
    """Build a DataFrame linking keyword/title CIDs to the CID of a paper's PDF.

    The PDF is downloaded into ``dirpath`` and added through ``client.add``.
    Keywords are extracted with ``rake.apply`` from "<title>. <summary>"; each
    keyword whose score exceeds ``RANK`` is added with ``client.add_str``, and
    the title is added last.

    Returns:
        A ``pandas.DataFrame`` built from the keys ``title``, ``cid_from`` and
        ``cid_to``.
    """
    paper = {'pdf_url': item['pdf_url'], "title": item['title']}
    title = item['title']
    keywords = rake.apply(title + '. ' + item['summary'])

    pdf_path = arxiv.download(paper, dirpath=dirpath)
    cid_to = client.add(pdf_path)['Hash']

    # keyword[0] is the phrase, keyword[1] the score compared against RANK.
    cid_from_list = [
        client.add_str(keyword[0]) for keyword in keywords if keyword[1] > RANK
    ]
    cid_from_list.append(client.add_str(title))

    return pd.DataFrame.from_dict(
        {'title': title, 'cid_from': cid_from_list, 'cid_to': cid_to})
def download_source(eprint='2012.06888', bib=False, dat=False):
    """Download a tar file from arXiv and choose the right file.

    The source tarball of ``eprint`` is downloaded, its members matching the
    wanted extensions are listed, and one of them is written next to the
    current working directory as ``<eprint with '/' replaced by '-'>.<ext>``.
    When exactly one member matches it is chosen automatically; otherwise the
    user is prompted for the listed number.  The tarball is deleted afterwards,
    also when extraction fails.

    Args:
        eprint: arXiv identifier of the paper.
        bib: Also list ``bib``/``bbl``/``inc`` members.
        dat: Also list ``dat`` members.  When both ``bib`` and ``dat`` are set
            the ``dat`` pattern wins, as in the original code.

    Returns:
        str: The last three characters of the chosen member's name.
    """
    filename = eprint.replace('/', '-')
    paper = arxiv.query(id_list=[eprint])[0]
    tar_path = arxiv.download(paper, prefer_source_tarfile=True)

    # The pattern only depends on the flags, so build it once up front.
    if dat:
        pattern = r'^.*(tex|xml|txt|dat)$'
    elif bib:
        pattern = r'^.*(tex|xml|txt|bib|bbl|inc)$'
    else:
        pattern = r'^.*(tex|xml|txt)$'
    file_type_regex = re.compile(pattern)

    try:
        with tarfile.open(tar_path, 'r') as archive:
            candidates = {}
            for member_name in archive.getnames():
                # VERBOSE lists every member regardless of its extension.
                if file_type_regex.match(member_name) or VERBOSE:
                    number = len(candidates) + 1
                    candidates[number] = member_name
                    print(number, member_name)

            if len(candidates) == 1:
                file_choice = 1
            else:
                file_choice = int(input('Choose a file: '))

            chosen = candidates[file_choice]
            source_file = archive.extractfile(chosen)
            file_type = re.match(r'.*(\w{3})$', chosen).group(1)
            with open(filename + '.' + file_type, 'wb') as output:
                output.write(source_file.read())
    finally:
        # The tarball is only a temporary download; never leave it behind.
        os.unlink(tar_path)
    return file_type
def test_download_on_sort(self):
    """Results sorted by ``submittedDate`` come back in the requested order.

    Both the ascending and the descending query must return a list of three
    papers that carry all expected keys, with ``publication_time`` ordered
    accordingly.
    """
    expected_keys = ('title', 'abstract', 'authors', 'publication_time',
                     'arxiv_url', 'pdf_url', 'journal_reference')

    asc_papers = download(query='rnn', max_results=3,
                          sort_by='submittedDate', sort_order='ascending')
    desc_papers = download(query='rnn', max_results=3,
                           sort_by='submittedDate', sort_order='descending')

    for batch in (asc_papers, desc_papers):
        self.assertEqual(type(batch), list)
        self.assertEqual(len(batch), 3)

    earlier = None
    for paper in asc_papers:
        for key in expected_keys:
            self.assertIn(key, paper)
        if earlier is not None:
            self.assertGreaterEqual(paper['publication_time'],
                                    earlier['publication_time'])
        earlier = paper

    later = None
    for paper in desc_papers:
        for key in expected_keys:
            self.assertIn(key, paper)
        if later is not None:
            self.assertGreaterEqual(later['publication_time'],
                                    paper['publication_time'])
        later = paper
def pickle2pdf(target_category='cs.LG'):
    """Download PDFs for the first few papers of the pickled 2012 listing.

    Papers are loaded from ``data/arxiv/2012_papers.pkl`` below the current
    working directory, looked up by their ``arxivid`` and saved under
    ``data/arxiv/<target_category>/pdf/``.  Processing stops once the paper
    counter exceeds 10.  Failed downloads are printed and skipped.

    Args:
        target_category: Folder name component used for the saved files.
    """
    def custom_slugify(obj):
        # The file name is the last path component of the paper's id.
        name = obj.get('id').split('/')[-1]
        res = 'data/arxiv/{}/pdf/'.format(target_category) + name
        print(res)
        return res

    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2013)
    ]
    print('filelist ready')

    n = 0  # papers attempted so far, shared across all files
    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))
        for paper in papers:
            if n > 10:
                break
            arxiv_id = paper['arxivid']
            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C still works.
                print('download {} failed'.format(arxiv_id))
            n += 1
    return
def download(self, slugify, reload_download_list=False):
    """Download every entry of ``self.to_download`` into ``self.download_path``.

    Args:
        slugify: Passed to ``arxiv.download`` as its ``slugify`` argument.
        reload_download_list: When true, ``self.prepare_for_download()`` is
            called before anything else.

    If the download list is empty it is reloaded once.  If it is still empty
    afterwards the method returns without downloading anything.  A failure
    for a single entry is reported and the remaining entries are still
    processed.
    """
    if reload_download_list:
        self.prepare_for_download()

    if not self.to_download:
        print('Download list is empty. Trying to reload.')
        self.prepare_for_download()
        # The original retried through a recursive call that dropped the
        # required ``slugify`` argument and could recurse without end; a
        # single explicit retry avoids both problems.
        if not self.to_download:
            print('Download list is still empty. Nothing to download.')
            return

    print('Download started...')
    for elem in tqdm.tqdm(self.to_download):
        try:
            arxiv.download(elem, slugify=slugify,
                           dirpath=self.download_path)
        except Exception:
            print(
                'Something went wrong during downloading {0} by url {1}. Passed'
                .format(elem['title'], elem['pdf_url']))
    print('Download finished!')
def retrieve(self):
    """Yield ``self.read_pdf(path)`` for every entry of ``self._results``.

    Each result is downloaded into a temporary directory.  That directory is
    created below the current working directory when ``self.keep`` is truthy
    and in the default temporary location otherwise.  It is removed when the
    ``with`` block is left.
    """
    pending = iter(self._results)
    save_dir = Path.cwd() if self.keep else None
    with TemporaryDirectory(dir=save_dir) as scratch_dir:
        for entry in pending:
            pdf_path = arxiv.download(entry, dirpath=scratch_dir)
            yield self.read_pdf(pdf_path)
def download(self):
    """
    Changes to paper directory. Downloads pdfs for each paper.

    ``./papers`` is created if missing and becomes the working directory (the
    change is not undone afterwards).  Every entry of ``self.pdf_links`` is
    appended to ``https://arxiv.org/pdf/`` to form the download URL; the part
    of the link before ``.pdf`` is used as the title.  A failed download is
    reported and the loop continues.
    """
    if not os.path.exists('./papers'):
        os.mkdir('./papers')
    os.chdir('./papers')

    base_url = 'https://arxiv.org/pdf/'
    for link in self.pdf_links:
        print('Downloading: {}'.format(link))
        paper = {
            'pdf_url': base_url + link,
            'title': link.split('.pdf')[0]
        }
        try:
            arxiv.download(paper)
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl-C can still
            # interrupt a long batch.
            print('Download: {} failed unexpectedly.'.format(link))
def test_download_on_query(self):
    """A plain query returns a list of the requested length with all expected keys."""
    expected_keys = ('title', 'abstract', 'authors', 'publication_time',
                     'arxiv_url', 'pdf_url', 'journal_reference')

    papers = download(query='rnn', max_results=2)

    self.assertEqual(type(papers), list)
    self.assertEqual(len(papers), 2)
    for paper in papers:
        for key in expected_keys:
            self.assertIn(key, paper)
def getDocumentTextFile(self, document):
    """Download a document's PDF, convert it to text and store the text file.

    The PDF is downloaded through ``arxiv.download`` into
    ``DATA_DIRECTORY_PATH``, converted with ``textract.process`` and deleted
    again whether or not the conversion succeeded.

    Returns:
        The path of the written text file, or ``None`` when the conversion to
        text failed.
    """
    document_pdf_file_path = arxiv.download(document, DATA_DIRECTORY_PATH)
    try:
        document_text = textract.process(document_pdf_file_path,
                                         encoding="utf-8")
    except Exception:
        # Conversion failed: there is no text file to report.
        return None
    finally:
        # The PDF is only an intermediate artefact in both outcomes.
        os.remove(document_pdf_file_path)

    # NOTE(review): if arxiv.download already returns a path that includes
    # DATA_DIRECTORY_PATH, the directory appears twice here - confirm.
    document_text_file_path = DATA_DIRECTORY_PATH + document_pdf_file_path[:-len(
        "pdf")] + "txt"
    with open(document_text_file_path, "w") as document_text_file:
        document_text_file.write(document_text)
    return document_text_file_path
def _is_within_directory(base_dir, candidate):
    """Return True when ``candidate`` resolves to a path inside ``base_dir``."""
    base = os.path.realpath(base_dir)
    target = os.path.realpath(candidate)
    return os.path.commonpath([base, target]) == base


def download_arxiv_paper(arxiv_id, feel_lucky: bool = True):
    """
    Download and unpack the source tarball of an arXiv paper.

    Input: arxiv_id
    Return: the downloaded paper folder.

    An already existing folder matching ``<paper_download_dir>/<arxiv_id>*/``
    is returned without downloading again.  ``feel_lucky`` is not used by this
    function; it is kept so existing callers keep working.
    """
    if not os.path.exists(paper_download_dir):
        os.makedirs(paper_download_dir)

    # Don't download again if exists
    search_this_paper = glob.glob(f'{paper_download_dir}/{arxiv_id}*/')
    if len(search_this_paper) > 0:
        return search_this_paper[0]

    tarpath = arxiv.download({'pdf_url': f"http://arxiv.org/pdf/{arxiv_id}"},
                             dirpath=paper_download_dir,
                             prefer_source_tarfile=True)
    extract_dir = tarpath.replace('.tar.gz', '')

    with tarfile.open(tarpath, 'r') as tar:
        # The archive comes from the network: skip members whose name would
        # place them outside the target folder (absolute paths, "../").
        # This is a name-based guard only; it does not inspect link targets.
        safe_members = [
            member for member in tar.getmembers()
            if _is_within_directory(extract_dir,
                                    os.path.join(extract_dir, member.name))
        ]
        tar.extractall(path=extract_dir, members=safe_members)
    return extract_dir
def test_download_on_filters(self):
    """A query built from search text, author and journal honours all filters.

    The single returned paper must carry all expected keys, mention the search
    text in its title or abstract, list the author, reference the journal and
    have a ``publication_time`` between the two date strings.
    """
    expected_keys = ('title', 'abstract', 'authors', 'publication_time',
                     'arxiv_url', 'pdf_url', 'journal_reference')
    search = 'Multi-Agent Reinforcement Learning'
    author = 'devlin'
    journal = 'nips'
    before = "2019-10-12"
    after = "2018-10-11"

    query = construct_query(search, author, journal)
    papers = download(query=query,
                      max_results=1,
                      before=datetime.strptime(before, "%Y-%m-%d"),
                      after=datetime.strptime(after, "%Y-%m-%d"))

    self.assertEqual(type(papers), list)
    self.assertEqual(len(papers), 1)
    for paper in papers:
        for key in expected_keys:
            self.assertIn(key, paper)
        found_in_text = search in paper['title'] or search in paper['abstract']
        self.assertTrue(found_in_text)
        wanted_author = author.lower()
        self.assertTrue(
            any(wanted_author in name.lower() for name in paper['authors']))
        self.assertIn(journal.lower(), paper['journal_reference'].lower())
        # The date bounds are compared as strings, as in the original test.
        self.assertGreaterEqual(before, paper['publication_time'])
        self.assertGreaterEqual(paper['publication_time'], after)
def loadArticles(self, inpString, directory, count=10):
    """Download up to ``count`` readable PDFs for a query into ``directory``.

    ``directory`` is created if it does not exist.  Twice as many results as
    requested are queried so that unreadable files can be skipped.  Every PDF
    that ``PyPDF2.PdfFileReader`` accepts is recorded in
    ``<directory>/index.txt`` as ``"<article id> <file name> 0"``; a file that
    cannot be opened or recorded is deleted again.

    Args:
        inpString: Query passed to ``arxiv.query``.
        directory: Target folder for the PDFs and the index file.
        count: Number of readable PDFs to collect.
    """
    try:
        os.mkdir(directory)
    except FileExistsError:
        pass  # an existing folder is reused

    articles = arxiv.query(query=inpString, max_results=count * 2)

    # ``with`` guarantees the index is flushed and closed even if a download
    # raises something other than FileExistsError.
    with open(os.path.join(directory, "index.txt"), 'a') as indexFile:
        for article in articles:
            try:
                filename = arxiv.download(article, dirpath=directory)
                try:
                    # Opening the file is the readability check.
                    PyPDF2.PdfFileReader(filename)
                    count -= 1
                    indexFile.write(
                        os.path.basename(article.id) + ' ' +
                        os.path.basename(filename) + ' 0\n')
                except Exception:
                    os.remove(filename)
            except FileExistsError:
                pass
            if count == 0:
                break
# coding: utf-8 # In[1]: import arxiv import argparse import os from tqdm import tqdm # In[6]: if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-l', '--list', required=True) parser.add_argument('-o', '--output_dir', default='./') args = parser.parse_args() if os.path.exists(args.output_dir) == False: os.makedirs(args.output_dir) l = open(args.list).readlines() for i in tqdm(range(len(l))): ti = l[i] q = 'ti:"{0}"'.format(ti.strip()).replace('-', ' ') print q r = arxiv.query(q) try: d = arxiv.download(dirname=args.output_dir, obj=r[0]) except Exception as e: print e print 'failed downloading: ', ti
def get_arxiv_link(bot, msg):
    """Answer an ``/arxiv <query>`` message with the first hit's details and PDF.

    Args:
        bot: Object whose ``sender`` offers ``sendMessage`` and ``sendDocument``.
        msg: Mapping with the message text under ``'text'``.

    The text after ``"/arxiv "`` is used as the search query.  When the search
    yields nothing a short notice is sent instead of a document.
    """
    search_text = msg['text'].replace("/arxiv ", "")
    results = arxiv.query(search_text, max_results=1)
    if not results:
        bot.sender.sendMessage("No results found for: {}".format(search_text))
        return

    paper = results[0]
    bot.sender.sendMessage("Title: {}\nAuthor: {}".format(paper['title'],
                                                          paper['author']))
    # Close the downloaded file once it has been handed to the sender.
    with open(arxiv.download(paper), "rb") as pdf_file:
        bot.sender.sendDocument(pdf_file)