Ejemplo n.º 1
0
 def test_download_from_dict(self):
     """Download the fixture paper dict and check that the expected PDF exists."""
     with tempfile.TemporaryDirectory() as temp_dir:
         expected_pdf = os.path.join(temp_dir,
                                     '1605.08386v1.The_Paper_Title.pdf')
         arxiv.download(self.paper_dict, dirpath=temp_dir)
         self.assertTrue(os.path.exists(expected_pdf))
Ejemplo n.º 2
0
def download(abstract_url: str, download_dir: str) -> None:
    """Look up the paper for ``abstract_url`` and save it under ``download_dir``.

    The value of ``paper.get("title")`` is handed to ``arxiv.download`` as the
    slug for the saved file. Progress is reported on stdout.
    """

    def _title_slug(entry):
        return entry.get("title")

    paper = get_paper(abstract_url)
    print(f"saving '{paper.get('title')}'")
    arxiv.download(paper, dirpath=download_dir, slugify=_title_slug)
    print(f"saved to {download_dir}")
Ejemplo n.º 3
0
 def test_download_with_custom_slugify_from_dict(self):
     """With ``custom_slugify`` the fixture dict is expected at '1605.08386v1.pdf'."""
     with tempfile.TemporaryDirectory() as temp_dir:
         target = os.path.join(temp_dir, '1605.08386v1.pdf')
         arxiv.download(self.paper_dict,
                        dirpath=temp_dir,
                        slugify=custom_slugify)
         self.assertTrue(os.path.exists(target))
Ejemplo n.º 4
0
 def test_download_with_custom_slugify_from_dict(self):
     """With ``custom_slugify`` the fixture dict is expected at '1605.08386v1.pdf'."""
     arxiv.download(self.paper_dict,
                    dirpath=self.temp_dir,
                    slugify=custom_slugify)
     target = os.path.join(self.temp_dir, '1605.08386v1.pdf')
     self.assertTrue(os.path.exists(target))
Ejemplo n.º 5
0
    def _download_papers(self, folder_path):
        """
        Download every paper in ``self.fav_papers`` into ``folder_path``,
        passing ``self._custom_slugify`` as the slugify callback.
        """
        for fav_paper in self.fav_papers:
            arxiv.download(
                fav_paper,
                slugify=self._custom_slugify,
                dirpath=folder_path,
            )
Ejemplo n.º 6
0
def get_information_arxiv(q_key, f_max_results):
    """Query arXiv and move each result's PDF into a per-query folder.

    The folder is named after ``q_key`` with spaces replaced by underscores and
    is created if missing. Each paper is downloaded with ``arxiv.download`` and
    the file ``<title>.pdf`` is then moved from the working directory into
    that folder. A paper whose download or move fails is skipped.

    Args:
        q_key: Search query, passed to ``arxiv.query`` as ``search_query``.
        f_max_results: Passed to ``arxiv.query`` as ``max_results``.

    Returns:
        List of the ``pdf_url`` values of all results, in result order.
    """
    papers = arxiv.query(
        search_query=q_key,
        max_results=f_max_results,
    )

    target_dir = q_key.replace(" ", "_")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    liste_link = []
    for paper in papers:
        liste_link.append(paper["pdf_url"])
        pdf_name = paper["title"] + ".pdf"

        try:
            arxiv.download(paper)
            shutil.move(pdf_name, target_dir)
        except Exception:
            # Best effort: skip this paper but drop any file left behind in the
            # working directory. The file may be absent when the download
            # itself failed, so a failing removal must not abort the loop.
            try:
                os.remove(pdf_name)
            except OSError:
                pass

    return liste_link
Ejemplo n.º 7
0
 def test_download_from_query(self):
     """Download the queried paper and check the PDF exists in ``self.temp_dir``."""
     pdf_name = '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf'
     arxiv.download(self.paper_query, dirpath=self.temp_dir)
     self.assertTrue(os.path.exists(os.path.join(self.temp_dir, pdf_name)))
Ejemplo n.º 8
0
 def test_download_tarfile_from_dict(self):
     """With ``prefer_source_tarfile=True`` the fixture dict is expected as a .tar.gz."""
     tar_name = '1605.08386v1.The_Paper_Title.tar.gz'
     arxiv.download(self.paper_dict,
                    prefer_source_tarfile=True,
                    dirpath=self.temp_dir)
     self.assertTrue(os.path.exists(os.path.join(self.temp_dir, tar_name)))
Ejemplo n.º 9
0
 def test_download_from_dict(self):
     """Download the fixture paper dict and check the PDF exists in ``self.temp_dir``."""
     pdf_name = '1605.08386v1.The_Paper_Title.pdf'
     arxiv.download(self.paper_dict, dirpath=self.temp_dir)
     self.assertTrue(os.path.exists(os.path.join(self.temp_dir, pdf_name)))
Ejemplo n.º 10
0
 def test_download_tarfile_from_query(self):
     """With ``prefer_source_tarfile=True`` the queried paper is expected as a .tar.gz."""
     tar_name = '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.tar.gz'
     arxiv.download(self.paper_query,
                    prefer_source_tarfile=True,
                    dirpath=self.temp_dir)
     self.assertTrue(os.path.exists(os.path.join(self.temp_dir, tar_name)))
Ejemplo n.º 11
0
 def test_download_from_query(self):
     """Download one result for the query and check the title-named PDF exists."""
     pdf_name = 'The_Multi_Agent_Reinforcement_Learning_in_MalmÖ_MARLÖ_Competition.pdf'
     arxiv.download(self.paper_query,
                    save_path=self.temp_dir,
                    max_results=1)
     self.assertTrue(os.path.exists(os.path.join(self.temp_dir, pdf_name)))
Ejemplo n.º 12
0
    def test_download_from_query(self):
        """Download the queried paper into a temp dir and check the PDF exists."""
        pdf_name = '1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf'
        with tempfile.TemporaryDirectory() as temp_dir:
            arxiv.download(self.paper_query, dirpath=temp_dir)
            downloaded = os.path.join(temp_dir, pdf_name)
            self.assertTrue(os.path.exists(downloaded))
Ejemplo n.º 13
0
def main(argv):
  """Search arXiv from the command line and download the papers the user picks.

  Options parsed from ``argv``:
    -q / --query   search query (required)
    -l / --limit   maximum number of results to list (default 10)
    -o / --output  existing directory to work in (default ./input)
    -h / --help    print the usage line and exit with status 0

  The matching titles are listed with 1-based numbers; the user then enters a
  comma-separated list of those numbers and each selected paper is downloaded.

  Exits with status 2 on a usage error or when the output directory does not
  exist.
  """
  usage = 'Use syntax: python arxiv-downloader.py -q "network node centrality" -l 50 -o ./input'

  try:
    # "o:" (with the colon) so that -o actually consumes its argument, and
    # "h" so that -h is accepted at all.
    opts, args = getopt.getopt(argv, "hq:l:o:",
                               ["help", "query=", "limit=", "output="])
  except getopt.GetoptError:
    print(usage)
    sys.exit(2)

  # Init defaults
  limit = 10
  query = False
  outputDirectory = "./input"

  # Parse arguments
  for opt, arg in opts:
    if opt in ("-h", "--help"):
      print(usage)
      sys.exit(0)
    elif opt in ("-q", "--query"):
      query = arg
    elif opt in ("-l", "--limit"):
      try:
        limit = int(arg)
      except ValueError:
        print(usage)
        sys.exit(2)
    elif opt in ("-o", "--output"):
      outputDirectory = arg

  if not query:
    print(usage)
    sys.exit(2)

  if not os.path.isdir(outputDirectory):
    print("Given output directory is not a directory.")
    sys.exit(2)
  # The downloads below are made without an explicit target directory;
  # presumably they land in the working directory, hence the chdir.
  os.chdir(outputDirectory)

  res = arxiv.query(query, prune=True, start=0, max_results=limit)

  # Keep only entries that have a title and display them with 1-based numbers.
  results = [elem for elem in res if "title" in elem]
  for number, elem in enumerate(results, 1):
    print(str(number) + ". " + elem["title"] + "\n")

  # Read the selection as plain text. Python 2's input() would eval() whatever
  # the user types, so prefer raw_input there.
  try:
    read_line = raw_input
  except NameError:
    read_line = input
  answer = read_line("Enter the numbers of the papers you want to download separated by commas: \n")

  for token in answer.split(","):
    token = token.strip()
    if not token:
      continue
    try:
      number = int(token)
    except ValueError:
      print("Skipping invalid selection: " + token)
      continue
    if not 1 <= number <= len(results):
      print("Skipping out-of-range selection: " + token)
      continue
    # Displayed numbers are 1-based while the results list is 0-based.
    arxiv.download(results[number - 1])
Ejemplo n.º 14
0
def download_pdf():
    """Download papers listed in meta_data folder.

    Reads the ``id``, ``pdf_url`` and ``title`` columns of
    ``meta_data/papers.csv``, downloads each paper and moves ``<title>.pdf`` to
    ``papers/<id>.pdf``. A failure for one paper is printed and the loop
    continues. The loop index is printed every 100 papers and there is a
    2.5 second pause after every paper.
    """
    # Read ids as text: an id such as "1605.08380" would otherwise be parsed as
    # a float (losing the trailing zero) and ``paper_id + '.pdf'`` would fail.
    df = pd.read_csv('meta_data/papers.csv',
                     usecols=['id', 'pdf_url', 'title'],
                     dtype={'id': str})

    # Make sure the destination directory exists before files are moved into it.
    Path('papers').mkdir(exist_ok=True)

    rows = zip(df.id, df.title, df.pdf_url)
    for i, (paper_id, title, pdf_url) in enumerate(rows):
        if i % 100 == 0:
            print(i)
        try:
            arxiv.download({'pdf_url': pdf_url, 'title': title})
            move(title + '.pdf', str(Path('papers', paper_id + '.pdf')))
        except Exception as e:
            print(e)
        time.sleep(2.5)
Ejemplo n.º 15
0
def main():
    """Download a single arXiv paper selected on the command line.

    Command-line options:
        --dir      directory name (default ``.``)
        --id       arXiv id (required)
        --prepend  flag, forwarded to ``arxiv.download`` as its third argument
        --slugify  flag, forwarded to ``arxiv.download`` as its fourth argument

    Exits through ``parser.error`` when the id lookup returns no result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', type=str, default='.', help='directory name')
    parser.add_argument('--id', type=str, required=True, help='arXiv id')
    parser.add_argument('--prepend',
                        action='store_true',
                        help='prepend paper id')
    parser.add_argument('--slugify',
                        action='store_true',
                        help='slugify file name')
    args = parser.parse_args()

    results = arxiv.query(id_list=[args.id])
    if not results:
        # Report an unknown id as a usage error instead of a bare IndexError.
        parser.error('no arXiv paper found for id {!r}'.format(args.id))
    arxiv.download(results[0], args.dir, args.prepend, args.slugify)
def pickle2pdf(target_category='cs.LG'):
    """Try to download the PDF of every paper listed in the 2012-2019 pickles.

    For each paper a label ``c`` is chosen from its categories: ``LG_CL`` when
    both ``cs.LG`` and ``cs.CL`` are present, ``LG`` or ``CL`` when only one
    is. The slug handed to ``ax.download`` is ``data/arxiv/<c>/pdf/<id>``.
    Successes are appended to ``suc_ids.txt``, failures to ``faild_ids.txt``,
    and the totals are printed at the end.

    Args:
        target_category: Not used by this function.
    """
    #flag = 'This article has been withdrawn'
    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2020)
    ]
    print('filelist ready')
    num_suc = 0
    num_fail = 0
    # NOTE(review): the label lives in a module-level global, as before; kept
    # in case other code reads it. A local variable would otherwise suffice.
    global c
    c = ''

    def custom_slugify(obj):
        # Reads ``c`` at call time, i.e. the label of the paper being downloaded.
        name = obj.get('id').split('/')[-1]
        return 'data/arxiv/{}/pdf/'.format(c) + name

    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))

        for paper in papers:
            arxiv_id = paper['arxivid']
            category = paper['categories']
            if 'cs.LG' in category:
                c = 'LG_CL' if 'cs.CL' in category else 'LG'
            elif 'cs.CL' in category:
                c = 'CL'
            # NOTE(review): a paper in neither category keeps the label of the
            # previous paper - confirm whether it should be skipped instead.

            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
                print('download {} {} succeed.'.format(arxiv_id, c))
                with open('suc_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_suc += 1
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C can still
                # stop this long-running loop.
                print('----------download {} {} failed'.format(arxiv_id, c))
                with open('faild_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_fail += 1
    print('num_suc: {} , num_fail: {}'.format(num_suc, num_fail))
 def paper_download(self):
     """Download ``self.paper`` into every author directory.

     The download directory and the author directories are prepared first.
     A failed download is logged as a warning and does not stop the loop;
     there is a 3 second pause after every attempt.
     """
     self.make_download_directory()
     author_dir_paths = self.make_author_directory()
     print("author_dir_paths", author_dir_paths)
     for target_dir in author_dir_paths:
         try:
             arxiv.download(obj=self.paper,
                            dirpath=f'{target_dir}',
                            slugify=self.get_paper_name)
             print("author_dir_path:", target_dir)
         except Exception as err:
             logging.warning(f"download_error {err}")
         time.sleep(3)
Ejemplo n.º 18
0
def download_from_arxiv(title, my_api_key, my_cse_id, dirname='./'):
    """Download a paper by title from arXiv, falling back to a Google search.

    Nothing is downloaded when the target file (as computed by
    ``get_filename``) already exists.

    Args:
        title (str): Full title of the paper
        my_api_key: Passed through to ``download_from_google``.
        my_cse_id: Passed through to ``download_from_google``.
        dirname (str): Output directory of the pdf after downloaded

    Returns:
        Filename of the downloaded pdf, or None when the paper had already
        been downloaded or the Google fallback did not produce the file.

    """
    title = '"' + title.replace('-', ' ') + '"'  # Put quote for exact search
    title = remove_non_ascii(title)

    results = arxiv.query('ti:' + title)
    logging.debug(results)

    outfile = get_filename(dirname, title)
    if os.path.isfile(outfile):
        logging.info(
            "Paper has already been downloaded previously. Will skip downloading this file (%s)",
            title)
        return None

    if paper_available_on_arxiv(results):
        logging.info("Downloading from arxiv: " + title)
        # When slugify is True, the paper title will be stripped of
        # non-alphanumeric characters before being used as a filename.
        return arxiv.download(results[0], dirname=dirname, slugify=True)

    google_query = title + ' filetype:PDF'
    logging.info("Paper is not on arxiv, downloading from google: " +
                 google_query)
    download_from_google(google_query, my_api_key, my_cse_id, outfile)
    # This branch used to fall through and return None even after a successful
    # download; report the file when it is actually there.
    return outfile if os.path.isfile(outfile) else None
Ejemplo n.º 19
0
def create_clinks_set(item, dirpath):
    """Build a DataFrame linking keyword/title CIDs to the CID of the paper file.

    Keywords are extracted with ``rake.apply`` from the title followed by the
    summary; the paper is downloaded into ``dirpath`` and added through
    ``client``. Every keyword whose score is above ``RANK`` and finally the
    title itself are added as strings; their results form the ``cid_from``
    column, while ``cid_to`` is the ``'Hash'`` of the added paper file.
    """
    paper = {'pdf_url': item['pdf_url'], "title": item['title']}
    abstract = item['summary']
    title = item['title']
    text = title + '. ' + abstract

    keywords = rake.apply(text)
    local_path = arxiv.download(paper, dirpath=dirpath)
    cid_to = client.add(local_path)['Hash']

    cid_from_list = [
        client.add_str(entry[0]) for entry in keywords if entry[1] > RANK
    ]
    cid_from_list.append(client.add_str(title))

    return pd.DataFrame.from_dict({
        'title': title,
        'cid_from': cid_from_list,
        'cid_to': cid_to,
    })
Ejemplo n.º 20
0
def download_source(eprint='2012.06888', bib=False, dat=False):
    """Download a tar file from arXiv and choose the right file.

    Archive members whose names end in an accepted suffix (all members when
    ``VERBOSE`` is truthy) are listed with a number. If exactly one member is
    listed it is chosen automatically, otherwise the user is prompted for a
    number. The chosen member is written to ``<eprint>.<ext>`` in the working
    directory, with ``/`` in ``eprint`` replaced by ``-`` and ``<ext>`` being
    the last three word characters of the member name. The downloaded tarball
    is removed afterwards, also when selection or extraction fails.

    Args:
        eprint: arXiv identifier of the paper.
        bib: Also offer files ending in ``bib``, ``bbl`` or ``inc``.
        dat: Also offer files ending in ``dat``. When both flags are set,
            ``dat`` takes precedence and the ``bib`` suffixes are not offered.

    Returns:
        The three-character file type of the extracted file.
    """
    filename = eprint.replace('/', '-')

    # Build the pattern once instead of recompiling it for every member.
    if dat:
        suffixes = 'tex|xml|txt|dat'
    elif bib:
        suffixes = 'tex|xml|txt|bib|bbl|inc'
    else:
        suffixes = 'tex|xml|txt'
    file_type_regex = re.compile(r'^.*(' + suffixes + r')$')

    paper = arxiv.query(id_list=[eprint])[0]
    tar_path = arxiv.download(paper, prefer_source_tarfile=True)
    try:
        with tarfile.open(tar_path, 'r') as this_tarfile:
            tarfiles = {}
            file_count = 0
            for this_file in this_tarfile.getnames():
                if file_type_regex.match(this_file) or VERBOSE:
                    file_count += 1
                    tarfiles[file_count] = this_file
                    print(file_count, tarfiles[file_count])
            if file_count == 1:
                file_choice = file_count
            else:
                file_choice = int(input('Choose a file: '))
            chosen = tarfiles[file_choice]
            file_type = re.match(r'.*(\w{3})$', chosen).group(1)
            source_file = this_tarfile.extractfile(chosen)
            with open(filename + '.' + file_type, 'wb') as output:
                output.write(source_file.read())
    finally:
        # Do not leave the tarball behind, whatever happened above.
        os.unlink(tar_path)
    return file_type
Ejemplo n.º 21
0
    def test_download_on_sort(self):
        """Check result shape, required fields and date ordering for both sort orders."""
        expected_fields = ('title', 'abstract', 'authors', 'publication_time',
                           'arxiv_url', 'pdf_url', 'journal_reference')
        common = dict(query='rnn', max_results=3, sort_by='submittedDate')

        asc_papers = download(sort_order='ascending', **common)
        desc_papers = download(sort_order='descending', **common)

        for papers in (asc_papers, desc_papers):
            self.assertEqual(type(papers), list)
            self.assertEqual(len(papers), 3)

        # Ascending: every paper is at least as recent as its predecessor.
        previous = None
        for paper in asc_papers:
            for field in expected_fields:
                self.assertIn(field, paper)
            if previous is not None:
                self.assertGreaterEqual(paper['publication_time'],
                                        previous['publication_time'])
            previous = paper

        # Descending: every paper is at most as recent as its predecessor.
        previous = None
        for paper in desc_papers:
            for field in expected_fields:
                self.assertIn(field, paper)
            if previous is not None:
                self.assertGreaterEqual(previous['publication_time'],
                                        paper['publication_time'])
            previous = paper
def pickle2pdf(target_category='cs.LG'):
    """Try to download PDFs for papers listed in the 2012 pickle file.

    Each arXiv id from the pickle is looked up with ``ax.query`` and handed to
    ``ax.download`` with a slug of ``data/arxiv/<target_category>/pdf/<id>``
    (the slug is also printed). The paper loop stops once more than 10
    attempts have been made, so at most 11 papers are tried. A failed lookup
    or download is reported on stdout and skipped.

    Args:
        target_category: Category name used in the slug path.
    """
    #flag = 'This article has been withdrawn'
    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2013)
    ]
    print('filelist ready')

    def custom_slugify(obj):
        name = obj.get('id').split('/')[-1]
        res = 'data/arxiv/{}/pdf/'.format(target_category) + name
        print(res)
        return res

    n = 0  # number of attempts so far, counted across all pickle files
    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))

        for paper in papers:
            if n > 10:
                break
            arxiv_id = paper['arxivid']
            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C still
                # interrupts the loop.
                print('download {} failed'.format(arxiv_id))
            n += 1
Ejemplo n.º 23
0
 def download(self, slugify, reload_download_list=False):
     """Download every element of ``self.to_download`` into ``self.download_path``.

     Args:
         slugify: Forwarded to ``arxiv.download`` for every element.
         reload_download_list: When true, ``self.prepare_for_download()`` is
             called before anything else.

     If the download list is empty it is reloaded once; if it is still empty
     afterwards the method returns without downloading. A failing element is
     reported on stdout and skipped.
     """
     if reload_download_list:
         self.prepare_for_download()
     if not self.to_download:
         print('Download list is empty. Trying to reload.')
         self.prepare_for_download()
         # The previous code re-entered download() here without the required
         # ``slugify`` argument (TypeError) and could recurse without end.
         if not self.to_download:
             print('Download list is still empty. Nothing to download.')
             return

     print('Download started...')
     for elem in tqdm.tqdm(self.to_download):
         try:
             arxiv.download(elem,
                            slugify=slugify,
                            dirpath=self.download_path)
         except Exception:
             print(
                 'Something went wrong during downloading {0} by url {1}. Passed'
                 .format(elem['title'], elem['pdf_url']))
     print('Download finished!')
Ejemplo n.º 24
0
 def retrieve(self):
     """Yield ``self.read_pdf(...)`` for each result, downloading into a temporary directory.

     The temporary directory is created under the current working directory
     when ``self.keep`` is truthy, otherwise ``dir=None`` is passed to
     ``TemporaryDirectory``.
     """
     pending = iter(self._results)
     save_dir = Path.cwd() if self.keep else None
     with TemporaryDirectory(dir=save_dir) as temp_dir:
         for result in pending:
             pdf_path = arxiv.download(result, dirpath=temp_dir)
             yield self.read_pdf(pdf_path)
Ejemplo n.º 25
0
    def download(self):
        """
        Changes to paper directory.
        Downloads pdfs for each paper.

        The ``./papers`` directory is created if needed and becomes the
        working directory; it is not changed back afterwards. Each entry of
        ``self.pdf_links`` is appended to ``https://arxiv.org/pdf/`` to form
        the PDF URL, and the part of the link before ``.pdf`` is used as the
        title. A failed download is reported on stdout and skipped.
        """

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('./papers', exist_ok=True)
        os.chdir('./papers')

        base_url = 'https://arxiv.org/pdf/'
        for link in self.pdf_links:
            print('Downloading: {}'.format(link))
            paper = {
                'pdf_url': base_url + link,
                'title': link.split('.pdf')[0]
            }
            try:
                arxiv.download(paper)
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C still
                # stops the loop.
                print('Download: {} failed unexpectedly.'.format(link))
Ejemplo n.º 26
0
    def test_download_on_query(self):
        """A plain query returns a list of the requested size with all expected fields."""
        expected_fields = ('title', 'abstract', 'authors', 'publication_time',
                           'arxiv_url', 'pdf_url', 'journal_reference')

        papers = download(query='rnn', max_results=2)

        self.assertEqual(type(papers), list)
        self.assertEqual(len(papers), 2)

        for paper in papers:
            for field in expected_fields:
                self.assertIn(field, paper)
    def getDocumentTextFile(self, document):
        """Download ``document`` as a PDF, convert it to text and save the text.

        The downloaded PDF is removed again whether or not the conversion
        succeeds.

        Returns:
            Path of the written text file, or None when the conversion to text
            failed (previously that case raised UnboundLocalError).
        """
        document_pdf_file_path = arxiv.download(document, DATA_DIRECTORY_PATH)

        try:
            document_text = textract.process(document_pdf_file_path,
                                             encoding="utf-8")
        except Exception:
            # Conversion to text failed: there is no text file to report.
            return None
        finally:
            os.remove(document_pdf_file_path)

        # NOTE(review): if arxiv.download already returns a path inside
        # DATA_DIRECTORY_PATH, this prefixes the directory twice - confirm.
        document_text_file_path = DATA_DIRECTORY_PATH + document_pdf_file_path[:-len(
            "pdf")] + "txt"
        # NOTE(review): mode "w" is kept as-is; if textract returns bytes this
        # needs "wb" on Python 3 - confirm against the interpreter in use.
        with open(document_text_file_path, "w") as document_text_file:
            document_text_file.write(document_text)

        return document_text_file_path
Ejemplo n.º 28
0
def download_arxiv_paper(arxiv_id, feel_lucky: bool = True):
    """
    Download and unpack the source archive of an arXiv paper.

    Input:
        arxiv_id: identifier appended to ``http://arxiv.org/pdf/``.
        feel_lucky: not used by this function.
    Return:
        the downloaded paper folder. If a folder matching
        ``<paper_download_dir>/<arxiv_id>*/`` already exists, the first match
        is returned and nothing is downloaded.
    """

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(paper_download_dir, exist_ok=True)

    # Don't download again if exists
    search_this_paper = glob.glob(f'{paper_download_dir}/{arxiv_id}*/')
    if search_this_paper:
        return search_this_paper[0]

    tarpath = arxiv.download({'pdf_url': f"http://arxiv.org/pdf/{arxiv_id}"},
                             dirpath=paper_download_dir,
                             prefer_source_tarfile=True)
    extract_dir = tarpath.replace('.tar.gz', '')
    with tarfile.open(tarpath, 'r') as tar:
        # The archive comes from the network: use the 'data' extraction filter
        # where the running Python provides it, so members cannot escape
        # ``extract_dir`` through absolute paths, '..' components or links.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=extract_dir, filter='data')
        else:
            # NOTE(review): this Python has no extraction filters; members are
            # extracted unchecked, as before.
            tar.extractall(path=extract_dir)
    return extract_dir
Ejemplo n.º 29
0
    def test_download_on_filters(self):
        """A filtered download honours the search, author, journal and date filters."""
        search = 'Multi-Agent Reinforcement Learning'
        author = 'devlin'
        journal = 'nips'
        before = "2019-10-12"
        after = "2018-10-11"
        expected_fields = ('title', 'abstract', 'authors', 'publication_time',
                           'arxiv_url', 'pdf_url', 'journal_reference')

        papers = download(query=construct_query(search, author, journal),
                          max_results=1,
                          before=datetime.strptime(before, "%Y-%m-%d"),
                          after=datetime.strptime(after, "%Y-%m-%d"))

        self.assertEqual(type(papers), list)
        self.assertEqual(len(papers), 1)

        for paper in papers:
            for field in expected_fields:
                self.assertIn(field, paper)

            # Search term in title or abstract, author and journal matched
            # case-insensitively.
            found_search = (search in paper['title']
                            or search in paper['abstract'])
            self.assertTrue(found_search)
            author_hits = [author.lower() in a.lower() for a in paper['authors']]
            self.assertTrue(any(author_hits))
            self.assertIn(journal.lower(), paper['journal_reference'].lower())

            # Publication time lies between the two date strings.
            self.assertGreaterEqual(before, paper['publication_time'])
            self.assertGreaterEqual(paper['publication_time'], after)
Ejemplo n.º 30
0
 def loadArticles(self, inpString, directory, count=10):
     """Download up to ``count`` readable PDFs for a query and index them.

     Args:
         inpString: Query string passed to ``arxiv.query``.
         directory: Target directory; created if it does not exist yet.
         count: Number of readable PDFs wanted. Twice as many results are
             requested from arXiv.

     Each article is downloaded into ``directory``. If ``PyPDF2`` can open
     the file, a line ``<article id basename> <file basename> 0`` is appended
     to ``index.txt`` in that directory; otherwise the file is deleted.
     Downloads that raise ``FileExistsError`` are skipped.
     """
     try:
         os.mkdir(directory)
     except FileExistsError:
         pass
     articles = arxiv.query(query=inpString, max_results=count * 2)
     # ``with`` closes the index file even if a download raises unexpectedly.
     with open(os.path.join(directory, "index.txt"), 'a') as indexFile:
         for article in articles:
             try:
                 filename = arxiv.download(article, dirpath=directory)
                 try:
                     # Opening the file is only a readability check.
                     PyPDF2.PdfFileReader(filename)
                     indexFile.write(
                         os.path.basename(article.id) + ' ' +
                         os.path.basename(filename) + ' 0\n')
                     # Count the article only once it has been indexed.
                     count -= 1
                 except Exception:
                     os.remove(filename)
             except FileExistsError:
                 pass
             if count == 0:
                 break
# coding: utf-8

# In[1]:

import arxiv
import argparse
import os
from tqdm import tqdm
# In[6]:

if __name__ == "__main__":
    # Command-line entry point: read paper titles (one per line) from the file
    # given with -l/--list, search arXiv for each exact title and download the
    # first hit into -o/--output_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--list', required=True)
    parser.add_argument('-o', '--output_dir', default='./')

    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Read all titles up front so the file is closed promptly and tqdm knows
    # the total.
    with open(args.list) as list_file:
        titles = list_file.readlines()

    for ti in tqdm(titles):
        q = 'ti:"{0}"'.format(ti.strip()).replace('-', ' ')
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print(q)
        r = arxiv.query(q)
        try:
            # r[0] raises IndexError when nothing matched; reported below.
            arxiv.download(dirname=args.output_dir, obj=r[0])
        except Exception as e:
            print(e)
            print('failed downloading:  ' + ti)
Ejemplo n.º 32
0
def get_arxiv_link(bot, msg):
    """Answer an ``/arxiv <query>`` message with the top arXiv hit and its PDF.

    The query is ``msg['text']`` with ``"/arxiv "`` removed. The title and
    author of the first result are sent as a message, then the paper is
    downloaded and sent as a document. When the query has no results, a short
    notice is sent instead.
    """
    query = msg['text'].replace("/arxiv ", "")
    results = arxiv.query(query, max_results=1)
    if not results:
        # Avoid an IndexError on results[0] when nothing matched.
        bot.sender.sendMessage("No arXiv results found for: {}".format(query))
        return

    paper = results[0]
    bot.sender.sendMessage("Title: {}\nAuthor: {}".format(paper['title'], paper['author']))
    # Close the PDF handle once the document has been handed to the bot.
    with open(arxiv.download(paper), "rb") as pdf_file:
        bot.sender.sendDocument(pdf_file)