def main(argv):
    """Search arXiv from the command line and download the papers the user picks.

    Recognised options (short and long forms):

    * ``-q`` / ``--query``  -- search query (required)
    * ``-l`` / ``--limit``  -- maximum number of results (default 10)
    * ``-o`` / ``--output`` -- existing directory to download into
      (default ``./input``)
    * ``-h`` / ``--help``   -- print the usage line and exit

    The titles of the results are listed with 1-based numbers; the user then
    enters the numbers of the papers to download, separated by commas.

    Exits with status 2 on a usage error or when the output directory does
    not exist.
    """
    usage = 'Use syntax: python arxiv-downloader.py -q "network node centrality" -l 50 -o ./input'

    try:
        # "o:" (with the colon) so that -o actually receives its argument.
        opts, args = getopt.getopt(argv, "hq:l:o:",
                                   ["help", "query=", "limit=", "output="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # Init defaults
    limit = 10
    query = False
    outputDirectory = "./input"

    # Parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit(0)
        elif opt in ("-q", "--query"):
            query = arg
        elif opt in ("-l", "--limit"):
            try:
                limit = int(arg)
            except ValueError:
                print(usage)
                sys.exit(2)
        elif opt in ("-o", "--output"):
            outputDirectory = arg

    if not query:
        print(usage)
        sys.exit(2)

    if not os.path.isdir(outputDirectory):
        print("Given output directory is not a directory.")
        sys.exit(2)
    # Downloads land in the current working directory.
    os.chdir(outputDirectory)

    res = arxiv.query(query, prune=True, start=0, max_results=limit)

    # Display titles
    results = [elem for elem in res if "title" in elem]
    for number, elem in enumerate(results, 1):
        print(str(number) + ". " + elem["title"] + "\n")

    toDownload = input("Enter the numbers of the papers you want to download separated by commas: \n")
    for number in _parse_selection(toDownload):
        # The list shown to the user is 1-based, `results` is 0-based.
        if 1 <= number <= len(results):
            arxiv.download(results[number - 1])
        else:
            print("Skipping invalid paper number: " + str(number))


def _parse_selection(selection):
    """Turn the answer to the download prompt into a list of paper numbers.

    Python 2's ``input()`` evaluates what was typed and yields an int or a
    tuple of ints, while Python 3's returns the raw text; both are accepted.
    Raises ``ValueError`` when an entry is not a whole number.
    """
    if isinstance(selection, int):
        return [selection]
    if isinstance(selection, str):
        return [int(part) for part in selection.split(",") if part.strip()]
    return [int(item) for item in selection]
def arxiv_search(query, message):
    """Reply to *message* with up to three arXiv results for *query*.

    Each result is rendered as one HTML line: first author (followed by
    "et al" when there are several authors), the title linked to the arXiv
    page, and the first 250 characters of the summary. Any exception is
    caught and written to the action log instead of being propagated.
    """
    try:
        arxiv_search_res = arxiv.query(search_query=query, max_results=3)
        entries = []
        for paper in arxiv_search_res:
            summary = paper['summary']
            # Add the ellipsis exactly when the 250-character cut drops text.
            end = '…' if len(summary) > 250 else ''
            a_name = paper['authors'][0]
            if len(paper['authors']) > 1:
                # The template below supplies the trailing period.
                a_name += ' et al'
            entries.append(
                '• {0}. <a href="{1}">{2}</a>. {3}{4}\n'.format(
                    # Escaped like the title so the HTML parse mode sees
                    # no raw markup characters.
                    escape(a_name),
                    paper['arxiv_url'],
                    escape(paper['title'].replace('\n', ' ')),
                    escape(summary[0:250].replace('\n', ' ')),
                    end))
        query_answer = ''.join(entries)
        print(query_answer)
        user_action_log(message,
                        "called arxiv search with query {}".format(query))
        my_bot.reply_to(message, query_answer, parse_mode="HTML")
    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception:\n{}: {}\nat {} line {}\n"
                   "Creating the alert file.".format(exc_type, ex, fname,
                                                     exc_tb.tb_lineno))
shutil.move(pdf, path + pdf) print("Move the original pdf to " + path) else: if not os.path.exists(non_arxiv_dir): os.makedirs(non_arxiv_dir) shutil.move(pdf, non_arxiv_dir + pdf) print('Move the original pdf to ' + non_arxiv_dir) print('') time.sleep(waittime) continue metadata = PdfReader(pdf).Info if '/arxiv_id' in metadata and '/updated' in metadata: paper = arxiv.query( id_list=[metadata['/arxiv_id'][1:-1]])[0] updated = metadata['/updated'][1:-1] else: paper = arxiv.query(id_list=[pdf[:-4].split('v')[0]])[0] updated = '' authors = paper['authors'] title = ' '.join(paper['title'].split()) year = str(paper['published_parsed'].tm_year) arxiv_id = paper['id'].split('/')[-1].split('v')[0] authors = paper['authors'][0].split(' ')[-1] title = title.replace(':', '').replace(',', '').replace( '-', ' ').replace('/', ' ') pdf_name = authors + ' (' + year + ') - ' + title + '.pdf' print()
def arxiv_query_title(title):
    """Look up arXiv papers by title.

    Hyphens in *title* are replaced by spaces and the result is searched as
    a quoted phrase in the ``ti:`` field. Returns whatever ``arxiv.query``
    returns.
    """
    phrase = title.replace('-', ' ')
    return arxiv.query(search_query='ti:"' + phrase + '"')
def update_articles():
    """Merge the most recently updated arXiv results into the stored articles.

    Reads ``DeepLearningArticles.json``, queries arXiv for the 100 most
    recently updated matches of ``search_query``, and puts every article
    that is new, or has a higher version than the stored one, at the front
    of the table (replacing the stored older version). The result is written
    to ``OUTPUT_FILE`` and prettified.
    """
    prefix = 'http://arxiv.org/abs/'

    def split_version(versioned_id):
        """Split '<key>v<N>' into (key, N); N may have any number of digits."""
        key, _, version = versioned_id.rpartition('v')
        return key, version

    # Updating records
    ordered_articles = pd.read_json("DeepLearningArticles.json",
                                    orient='index')

    # Dictionary for existing articles {id without version: version number}
    article_dict = dict(
        split_version(item) for item in article_id(ordered_articles))

    # Getting new articles, change max_results
    new_articles = arxiv.query(search_query,
                               max_results=100,
                               sort_by="lastUpdatedDate",
                               sort_order="descending")
    new_articles_df = pd.DataFrame.from_dict(new_articles)
    ordered_new_articles = new_articles_df.reindex(columns=[
        'title', 'author', 'authors', 'id', 'arxiv_comment',
        'arxiv_primary_category', 'published', 'summary', 'tags', 'updated'
    ])

    counter = 0
    for item in article_id(ordered_new_articles):
        article_key, version = split_version(item)
        known_version = article_dict.get(article_key)

        if known_version is None:
            # Newly published article
            notice = "Added a new paper."
        elif int(version) > int(known_version):
            # Newer version of a stored article: remove the old row by its
            # id (a boolean mask is independent of the index labels).
            old_article_id = prefix + article_key + 'v' + known_version
            ordered_articles = ordered_articles[
                ordered_articles['id'] != old_article_id]
            notice = "Updated a newer version."
        else:
            continue

        article_dict[article_key] = version
        fresh_rows = ordered_new_articles[
            ordered_new_articles['id'] == prefix + item]
        # New rows are put at the front of the table.
        ordered_articles = pd.concat([fresh_rows, ordered_articles],
                                     axis=0,
                                     sort=False,
                                     ignore_index=True)
        print(notice)
        counter += 1

    ordered_articles.to_json(OUTPUT_FILE, orient='index')
    prettify_json(OUTPUT_FILE)
    print("Update completed: {} updates made.".format(counter))
from datetime import datetime import re import requests from googletrans import Translator from time import sleep import arxiv #webhook POST先URL API_URL = "https://maker.ifttt.com/trigger/arxivLine/with/key/fdpKKfnpX20wqLHVNK2r4zom5lnmyU3jlBVzZ6zAfk2" #検索ワード QUERY = "cat:'astro-ph.IM" result_list = arxiv.query(query=QUERY, max_results=2, sort_by='submittedDate') translator = Translator() dt = datetime.now().strftime("%Y/%m/%d %H:%M:%S") requests.post(API_URL, data={"value1": dt}) def translate_post(): title_jpn = translator.translate(title, src='en', dest='ja').text abst_jpn = translator.translate(abst, src='en', dest='ja').text print("---------" + str(count) + "ページ目----------") print("author{}".format(author)) print(url) print("title:{}".format(title_jpn)) print("date:{}".format(date)) print("Abstract:{}".format(abst_jpn)) message = "\n".join([
async def arxiv_random(message):
    """Reply to *message* with a randomly chosen recent arXiv math paper.

    The identifiers of math papers since the last publication date (worked
    out in US/Eastern time) are fetched from the arXiv OAI endpoint, one is
    picked at random, and its author, title, summary and PDF link with size
    are sent as an HTML reply. A 503 response pushes back
    ``arxiv_checker.last_call``; other failures are only logged. Exceptions
    are caught and written to the action log.
    """
    user_action_log(message, "made arxiv random query")
    try:
        eastern = pytz.timezone('US/Eastern')
        eastern_time = datetime.datetime.now(eastern)
        # publications on 20:00
        if eastern_time.hour < 20:
            eastern_time -= datetime.timedelta(days=1)
        # no publications on friday and saturday
        if eastern_time.weekday() == 5:
            eastern_time -= datetime.timedelta(days=2)
        elif eastern_time.weekday() == 4:
            eastern_time -= datetime.timedelta(days=1)
        last_published_date = eastern_time.strftime("%Y-%m-%d")
        response = requests.get('http://export.arxiv.org/oai2',
                                params={
                                    'verb': 'ListIdentifiers',
                                    'set': 'math',
                                    'metadataPrefix': 'oai_dc',
                                    'from': last_published_date
                                })
        action_log(
            "Random arxiv paper since {}".format(last_published_date))
        # everything went fine
        if response.status_code == 200:
            response_tree = ElementTree.fromstring(response.content)
            # Keep only <header> children: the list element may also carry
            # a trailing resumptionToken, which is not a paper.
            paper_headers = [
                node for node in response_tree[2]
                if node.tag.rpartition('}')[2] == 'header'
            ]
            # random.choice never indexes past the end (randint(0, n) did);
            # an empty list raises IndexError, which is logged below.
            paper_header = random.choice(paper_headers)
            paper_arxiv_id = paper_header[0].text.split(':')[-1]  # hardcoded
            papep_obj = arxiv.query(id_list=[paper_arxiv_id])[0]
            paper_link = papep_obj['pdf_url'].replace(
                'http://', 'https://') + '.pdf'
            paper_link_name = paper_link.split("/pdf/")[1]
            print(paper_link)
            print(paper_link_name)
            req_pdf_size = requests.head(paper_link)
            # Content-Length is in bytes; report megabytes.
            pdf_size = round(
                int(req_pdf_size.headers["Content-Length"]) / 1024 / 1024, 2)
            query_answer = '{}. <a href="{}">{}</a>. {}\n\n— <a href="{}">{}</a>, {} Мб\n'.format(
                papep_obj['author_detail']['name'], papep_obj['arxiv_url'],
                escape(papep_obj['title'].replace('\n', ' ')),
                escape(papep_obj['summary'].replace('\n', ' ')), paper_link,
                paper_link_name, pdf_size)
            await message.reply(query_answer,
                                parse_mode="HTML",
                                disable_web_page_preview=False)
            user_action_log(
                message, "arxiv random query was successful: "
                "got paper {}".format(papep_obj['arxiv_url']))
            # TODO(randl): doesn't send. Download and delete?
            # my_bot.send_document(message.chat.id, data=paper_link)
        elif response.status_code == 503:
            # we are querying too often
            action_log("Too much queries. 10 minutes break should be enough")
            arxiv_checker.last_call = datetime.datetime.utcnow(
            ) - datetime.timedelta(seconds=610)
        else:
            # anything else is a failure
            user_action_log(
                message, "arxiv random query failed: response {}".format(
                    response.status_code))
    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception: {}: {}\nat {} line {}".format(
            exc_type, ex, fname, exc_tb.tb_lineno))
if not dics["urls"] and not dics["photos"]: print("skip", flush=True) continue # arxivのリンク情報を埋める for url in dics["urls"]: # Arxivだった場合 if 'arxiv' in url: arxivid_ = url.split('/')[-1] ids = [s for s in arxivid_.split('.') if s[0].isdigit()] arxivid__ = "" for i in range(len(ids)): arxivid__ += ids[i] + "." arxivid = arxivid__[:-1] arxivtitle = arxiv.query(id_list=[arxivid])[0]['title'] # Youtubeの場合 if 'youtu' in url: # if it is youtube link imgurl = subprocess.check_output("youtube-dl --get-thumbnail \"" + url + "\"", shell=True) youtubetitle = subprocess.check_output( "youtube-dl --get-title \"" + url + "\"", shell=True) # solve decode problem if type(imgurl) == bytes: imgurl = imgurl.decode("utf-8", "ignore") if type(youtubetitle) == bytes: youtubetitle = youtubetitle.decode("utf-8", "ignore") # photosの画像があるならそちらを優先
#!/usr/bin/env python import pprint # import requests import arxiv import pandas as pd from googletrans import Translator translator = Translator() # p_list = arxiv.query(query='au:"Grisha Perelman"') p_list = arxiv.query(query='au:"Henggang Cui"') # num = len(p_list) # print(type(l)) # print(type(l[0])) #pprint.pprint(l[0], width=200) for i in p_list: print( "\n\n\n----------------------------------------------------------------------------------------------------" ) print("\nタイトル:\n" + translator.translate(i['title'], src='en', dest='ja').text + '\n(' + i['title'] + ')') print("\npublished:\n" + i['published']) print("\nauthors:") for j in i['authors']:
file.write(response.read()) file.close() print('下载论文' + " " + savePath) uploaded = drive.CreateFile({'title': savePath}) #uploaded.SetContentString(response.read()) uploaded.SetContentFile('./' + savePath) uploaded.Upload() print("保存至google drive 云盘成功") os.remove('./' + savePath) keywords = 'Machine Learning' auth.authenticate_user() gauth = GoogleAuth() gauth.credentials = GoogleCredentials.get_application_default() drive = GoogleDrive(gauth) print('接入google drive') list = axv.query(search_query=keywords, start=0, max_results=2000) print('搜索arxiv上关键字:{} 相关的论文'.format(keywords)) print('共找到arxiv收录论文{}篇'.format(len(list))) print('开始下载...') for obj in iter(list): filename = obj['title'] + ".pdf" filename = filename.replace(' ', '_').replace('/', '_').replace('\n', '_') pdf_url = obj['pdf_url'] download_arxiv_pdf(drive, filename, pdf_url)
##### Arxive.org ############################## import arxiv aa = arxiv.query(search_query='machine', start=0, max_results=10)
def get_arxiv_link(bot, msg):
    """Answer an ``/arxiv <query>`` message with the top arXiv hit.

    The text of *msg* (minus the ``/arxiv `` prefix) is used as the query.
    The title and author of the first result are sent as a message, then the
    paper is downloaded and sent as a document. When nothing is found, a
    short notice is sent instead.
    """
    results = arxiv.query(msg['text'].replace("/arxiv ", ""), max_results=1)
    if not results:
        bot.sender.sendMessage("No results found.")
        return

    paper = results[0]
    bot.sender.sendMessage("Title: {}\nAuthor: {}".format(
        paper['title'], paper['author']))
    # Close the downloaded file once it has been handed to the bot.
    with open(arxiv.download(paper), "rb") as pdf_file:
        bot.sender.sendDocument(pdf_file)
def search_n(self, query, n):
    """Log *query*, fetch at most *n* arXiv results and return them shaped."""
    self.logger.info(query)
    return self.shaping(arxiv.query(query=query, max_results=n))
def search_n_random(self, query, n, sample_size=4):
    """Fetch up to *n* arXiv results and return a shaped random subset.

    *sample_size* results are drawn at random (default 4, the previously
    fixed value); when fewer were found, all of them are returned.
    *sample_size* must not be negative.
    """
    self.logger.info(query)
    ret = arxiv.query(query=query, max_results=n)
    if len(ret) >= sample_size:
        results = random.sample(ret, sample_size)
    else:
        results = ret
    return self.shaping(results)
def main():
    """Collect metadata, PDFs, entity counts and citation counts for papers.

    Command-line arguments: database name, number of papers, number of
    items to skip, and the version number of the overview CSV. For every
    booktitle the publications are read from MongoDB; for each one a PDF is
    fetched (directly when the ``ee`` link ends in ``.pdf``, otherwise via an
    arXiv author/title search), the full text and per-facet entity sets are
    written out, and the citation count is looked up on Google Scholar when
    a PDF was obtained. The overview CSV is rewritten after every 20
    processed publications and once more at the end.
    """
    # ################### #
    #      SETUP ARGS     #
    # ################### #
    parser = argparse.ArgumentParser(
        description='Fetch all information for papers')
    parser.add_argument('database',
                        metavar='Database',
                        type=str,
                        help='database name of data collection')
    parser.add_argument('number_papers',
                        metavar='Number of Papers',
                        type=int,
                        help='number of papers to be downloaded')
    parser.add_argument(
        'skip_items',
        metavar='Number items to skip',
        type=int,
        help='number of items to skip from returned collection')
    parser.add_argument(
        'version',
        metavar='Version of overview csv',
        type=int,
        help='versioning number used to name overview csv for databases')
    args = parser.parse_args()
    database = args.database
    number_papers = args.number_papers
    skip_items = args.skip_items
    version = args.version

    client = MongoClient('localhost:4321')
    db = client.pub
    booktitles = ['TREC']

    # ########################### #
    #      FETCH PUBLICATIONS     #
    # ########################### #
    # print("Fetching publication information from TSE-NER server; publication attributes, has_pdf, number_entities, #citations_pub, #citations_author: ")
    for booktitle in booktitles:
        papers = []
        counter_pub = 0
        counter_pdf = 0
        counter_cit = 0
        facets_columns = ';'.join(facets)
        # Header of the overview CSV; positions match paper_info below.
        csv_columns = [
            'paper_id', 'has_pdf', facets_columns, 'number_citations',
            'booktitle', 'pdf_url', 'year', 'title', 'type', 'authors'
        ]

        results = db.publications.find({
            'booktitle': booktitle
        }).skip(skip_items).limit(number_papers).batch_size(100)
        print(
            f'Fetching {results.count(True)} out of {results.count()} total publications information for conference: {booktitle}'
        )

        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        querier.apply_settings(settings)
        # querier.save_cookies()
        scholar_query = scholar.SearchScholarQuery()

        for pub in results:
            if not pub['title'] or not pub['authors']:
                continue
            counter_pub += 1

            author1 = pub['authors'][0]
            title = pub['title'].lower().capitalize().strip('.')
            # [_id, has_pdf, number_entities, number_citations, booktitle,
            #  pdf_url, year, title, type, authors]
            paper_info = [
                pub['_id'], 'false', '-1;-1', '-1', booktitle, pub['ee'],
                pub['year'],
                "'%s'" % title, pub['type'],
                ';'.join(f'{author}' for author in pub['authors']).strip(';')
            ]
            no_accent_author1 = unidecode.unidecode(author1)

            pdf_file_path = f'{ROOTPATH}/data/{database}/{booktitle.lower()}/pdf/'
            os.makedirs(os.path.dirname(pdf_file_path), exist_ok=True)

            # Have multiple PDF fetch methods: Direct EE link, Arxiv
            # ADD ELSAPY
            if pub['ee'][-4:].lower() == '.pdf':
                paper_info[1] = 'true'
                download_pdf(pdf_file_path, paper_info[5], database,
                             booktitle, paper_info[0])
            else:
                arxiv_query = f'au:{author1}+AND+ti:{title}'
                articles = arxiv.query(search_query=arxiv_query)
                if articles:
                    for art in articles:
                        # Accept only an exact match on the normalised title.
                        if art['title'].lower().capitalize().strip(
                                '.') == title:
                            paper_info[5] = art['pdf_url']
                            paper_info[1] = 'true'
                            arxiv.download(art, pdf_file_path, slugify=True)
                            # Rename the slug-named download to the paper id.
                            os.rename(
                                f'{pdf_file_path}{to_slug(art["title"])}.pdf',
                                f'{pdf_file_path}{paper_info[0]}.pdf')
                            print(
                                f'Finished PDF download for {paper_info[0]}')

            # Download full text
            if 'content' in pub.keys() and 'fulltext' in pub['content'].keys():
                write_full_text_file(paper_info[0], database, booktitle,
                                     pub['content']['fulltext'])

            # Get distinct #entities for total facets
            # ADD PROPER ENTITIES EXTRACTION
            facets_entities = ''
            for facet in facets:
                entities = fetch_paper_entities(pub['_id'], facet, db)
                facets_entities += f'{len(entities)};'
                # Write paper facet entity set to TXT
                write_entity_set_file(pub['_id'], booktitle, entities,
                                      database, facet)
            paper_info[2] = facets_entities.strip(';')

            # Only fetch citations if a PDF has been downloaded
            if paper_info[1] == 'true':
                counter_pdf += 1

                # Get number of citations info
                scholar_query.set_author(no_accent_author1)
                scholar_query.set_phrase(title)
                scholar_query.set_num_page_results(1)
                querier.send_query(scholar_query)

                # Use the first article found only if its title matches
                if querier.articles and title == querier.articles[0][
                        'title'].lower().capitalize().strip('.'):
                    print(
                        f'Fetched number citations for {paper_info[0]}: {querier.articles[0]["num_citations"]}'
                    )
                    paper_info[3] = querier.articles[0]['num_citations']
                    counter_cit += 1

            # Add paper information to list
            papers.append(paper_info)
            print(f'✓ {pub["_id"]}')

            # Write papers information to CSV file
            # (== rather than `is`: identity of ints is not guaranteed)
            if counter_pub % 20 == 0:
                print('----- STATISTICS -----')
                print("Processed:", counter_pub)
                write_arrays_to_csv(papers, booktitle, database, csv_columns,
                                    skip_items, version)
                print(
                    f'PDFs downloaded for {counter_pdf}/{counter_pub} publications for {booktitle}'
                )
                print('----------------------')

        print('----- FINAL STATISTICS -----')
        print("Processed:", counter_pub)
        write_arrays_to_csv(papers, booktitle, database, csv_columns,
                            skip_items, version)
        print(
            f'PDFs downloaded for {counter_pdf}/{counter_pub} publications for {booktitle}'
        )
        print('-----------------------')
        print(
            f'Finished processing {counter_pub} publications and downloading {counter_pdf} PDFs for {booktitle}'
        )
"-t", "--topic", type=str, nargs=1, default="text summarization", help= 'Topic to query arXiv for (surround in quotes if topic includes spaces). Default: "text summarization"' ) parser.add_argument( "-n", "--num_results", type=int, default=10, help="Number of research articles to download from arXiv. Default: 10") args = parser.parse_args() results = arxiv.query(query=args.topic, max_results=args.num_results, iterative=True) for i, paper in enumerate(results()): paper_info = {'title': paper['title'], 'url': paper['pdf_url']} pdf_filename = paper_info['url'].split('/')[-1] pdf_dir = "../downloads/" os.makedirs(pdf_dir, exist_ok=True) arxiv.arxiv.download(paper, dirpath=pdf_dir)
#def getrefname(b): #i,j = b.index('{'), b.index(',') #return b[i+1:j] for refname, arxivid, selected in publist: query = "SELECT {} from documents WHERE bibtex LIKE '%{}%'".format(','.join(headers), refname) refres = list(ref.con.execute(query).next()) author, title, year, filename, bibtex = refres bibdic = ref.parse_bibtex(bibtex) author = bibdic['author'] # include firstnames abstract = '' #refname = getrefname(bibtex) print refname if arxivid: arxivres = arxiv.query(id_list=[arxivid])[0] author = ', '.join(arxivres['authors']) abstract = arxivres['summary'] assert filter(str.isalnum, str(arxivres['title'])).lower() == filter(str.isalnum, title).lower() # yaml long strings with gt (>) then indented lines. abstract = '\n'.join(' '+li for li in abstract.split('\n')) ###### WRITE YAML ALWAYS yaml = """--- title: "{}" author: "{}" journal: "{}" year: {} arxiv: "{}" shortname: {} thumbnail: /{}/{}.png excerpt: ""
def found():
    """Run the search requested by the submitted form and render the results.

    Form fields: ``engine`` (``arxiv``, ``scholar``, ``nature`` or ``ieee``,
    case-insensitive), ``keyword`` and ``number`` (maximum number of
    results). The results are rendered as an HTML table in
    ``searchengine.html``. An unknown engine yields a 400 response.
    """
    branch = request.form['engine'].lower()  # ieee or arxiv
    keyword = request.form['keyword']
    noofresults = int(request.form['number'])

    if branch == 'arxiv':
        table = _search_arxiv(keyword, noofresults)
    elif branch == 'scholar':
        table = _search_scholar(keyword, noofresults)
    elif branch == 'nature':
        table = _search_nature(keyword, noofresults)
    elif branch == 'ieee':
        table = _search_ieee(keyword, noofresults)
    else:
        # Previously fell through and returned None, which is not a valid
        # view response.
        return 'Unsupported search engine', 400

    return render_template('searchengine.html',
                           tables=[
                               table.to_html(
                                   render_links=True,
                                   classes=['table table-bordered'])
                           ],
                           current_user=current_user)


def _search_arxiv(keyword, limit):
    """Return title, publication date and PDF link of arXiv matches."""
    result = arxiv.query(query=keyword, max_results=limit)
    rows = [{
        "Title": paper['title'],
        "Published Date": paper['published'],
        # Swap only the path segment, not any other "abs" in the URL.
        "Download Link": paper['arxiv_url'].replace('/abs/', '/pdf/', 1)
    } for paper in result]
    return pd.DataFrame(rows,
                        columns=["Title", 'Published Date', 'Download Link'])


def _search_scholar(keyword, limit):
    """Scrape Google Scholar result pages for PDF links and their titles."""
    from urllib.parse import quote_plus

    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"
    }
    search = quote_plus(keyword)
    headings = []
    links = []

    # The start offset advances by 10 per page; stop as soon as enough
    # title/link pairs have been collected instead of always fetching 100
    # pages.
    for start in range(0, 1000, 10):
        if min(len(headings), len(links)) >= limit:
            break
        page = requests.get(
            f"https://scholar.google.com/scholar?start={start}&q={search}",
            headers=headers).text
        soup = BeautifulSoup(page, "lxml")
        # Anchors without an href are skipped rather than raising KeyError.
        pdf_anchors = [
            anchor for anchor in soup.findAll('a')
            if ".pdf" in anchor.get("href", "")
        ]
        for anchor in pdf_anchors:
            links.append(anchor['href'])
        for anchor in pdf_anchors:
            click_id = anchor.get("data-clk-atid")
            if click_id is None:
                continue
            # Anchors sharing the click id include the result's title link;
            # the "[PDF]" one is the download link itself.
            for related in soup.findAll('a',
                                        attrs={"data-clk-atid": click_id}):
                text = related.getText()
                if "[PDF]" not in text:
                    headings.append(text)

    # Columns must have equal length to form a DataFrame.
    paired = min(len(headings), len(links))
    df = pd.DataFrame({
        "Title": headings[:paired],
        "PDFLink": links[:paired]
    })
    return df[:limit]


def _search_nature(keyword, limit):
    """Scrape nature.com search results and each article page for details."""
    from urllib.parse import quote_plus

    def getdata(url):
        # Get data from website
        return requests.get(url).text

    # Getting initial soup
    search = quote_plus(keyword)
    soup = BeautifulSoup(
        getdata(f"https://www.nature.com/search?q={search}"), "lxml")

    # Getting Headings
    # NOTE(review): the slice offsets below are tied to the page layout at
    # the time of writing -- confirm they still hold.
    terms = soup.findAll('span', attrs={"class": "visually-hidden"})
    terms = terms[6:]
    terms = terms[:-9]
    headings = [term.getText()[12:][:-49] for term in terms[:limit]]

    # Getting Links
    links = soup.findAll('a', attrs={"data-track-action": "search result"})
    alinks = [f"https://www.nature.com{link['href']}" for link in links[:limit]]

    # Columns must have equal length to form a DataFrame.
    paired = min(len(headings), len(alinks))
    headings = headings[:paired]
    alinks = alinks[:paired]

    # Getting abstracts, publication dates and authors
    abstracts = []
    datePublished = []
    authors = []
    for alink in alinks:
        sp = BeautifulSoup(getdata(alink), "lxml")
        tm = sp.find('div', attrs={"class": "c-article-section__content"})
        date = sp.find('time', attrs={"itemprop": "datePublished"})
        author = sp.findAll('a', attrs={"data-test": "author-name"})

        if len(author) > 0:
            authors.append([autho.getText() for autho in author])
        else:
            authors.append("Authors not mentioned")

        if date:
            datePublished.append(date.getText())
        else:
            datePublished.append("Date Published not mentioned")

        paragraph = tm.find("p") if tm else None
        if paragraph:
            abstracts.append(paragraph.getText())
        else:
            abstracts.append("Abstract Not Available")

    # Creating dataframe from the lists
    return pd.DataFrame({
        "Heading": headings,
        "ArticleLink": alinks,
        "ArticleAbstract": abstracts,
        "PublicationDate": datePublished,
        "Authors": authors
    })


def _search_ieee(keyword, limit):
    """Query the IEEE Xplore REST search endpoint page by page."""
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Origin": "https://ieeexplore.ieee.org",
        "Content-Type": "application/json",
    }
    titles_list = []
    links_list = []
    date_list = []
    abstract_list = []
    citation_list = []

    # The page count is computed on the assumption of 25 records per page.
    for page_no in range(1, math.ceil(limit / 25) + 1):
        payload = {
            "newsearch": True,
            "queryText": keyword,
            "highlight": True,
            "returnFacets": ["ALL"],
            "returnType": "SEARCH",
            "pageNumber": page_no
        }
        r = requests.post("https://ieeexplore.ieee.org/rest/search",
                          json=payload,
                          headers=headers)
        # A response without "records" is treated as an empty page.
        for record in r.json().get("records", []):
            titles_list.append(record["articleTitle"])
            links_list.append('https://ieeexplore.ieee.org' +
                              record["documentLink"])
            date_list.append(record["publicationDate"])
            citation_list.append(record["citationCount"])
            abstract_list.append(
                record.get('abstract', 'Abstract not present'))

    df = pd.DataFrame.from_dict({
        "Title": titles_list,
        "Abstract": abstract_list,
        "Link": links_list,
        "Publication Date": date_list,
        "No of Citations": citation_list
    })
    return df[:limit]
columns=[ "Date", "Time", "Location", "#", "Session", "Session Title", "Paper ID", "Paper Title", "Authors", "Title-arxiv", "Abstract", "URL" ]) pa_table = soup.select("table")[3] index = 0 for i in range(len(pa_table.find_all('td'))): i_column = i % 9 if pa_table.find_all('td')[i].string is not None: df.iloc[index, i_column] = pa_table.find_all('td')[i].string if i_column == 8: title = df.iloc[index, 7] tokens = title.replace(":", "").replace("-", " ").strip().split() s = tokens[0] + "+AND+all:" + "+AND+all:".join(tokens[1:]) try: results = arxiv.query(s, prune=True, start=0, max_results=1) except: pass if len(results) != 0: df.iloc[index, 9] = results[0]['title'].replace("\n ", "") df.iloc[index, 10] = results[0]['summary'].replace("\n", " ") df.iloc[index, 11] = results[0]['pdf_url'] print(str(index) + "/783 done") index += 1 df.to_csv('paper_lists.csv')
#Directory to save papers in dest = str(sys.argv[3]) dest = dest.replace("'", "") if dest[-1] != "/": dest = dest + "/" #Maximum number of papers to download max_len = int(sys.argv[4]) #Search within papers search_pdf = str(sys.argv[5]) search_pdf = search_pdf.replace("'", "") search_pdf = search_pdf.replace(",", " AND") result = arxiv.query(query=search_arxiv, max_results=max_len) j = 1 for paper in result: #for each paper found in the search if 'links' in paper: #if the paper has the attribute 'links' links = paper['links'] for i in links: #for every link if i['type'] == 'application/pdf': #if that link is a pdf h = i['href'] #save the href of that link in 'h' file = "pdf" + str(j) + ".pdf" #creating the file name dest_file = dest + file #concatenating the filename with the fimepath input by user wget.download(h, dest_file) #download the pdf to the directory j = j + 1
if not bmask: try: links = get_links_ads(bibcode, q='arxiv') except: # HttpError: print("rate limited ", time.localtime()) time.sleep(300) continue if links: arxivid = links[0].split('/')[-1] aid = arxivid.split(':')[-1] try: # search with api paper = arxiv.query(id_list=[aid])[0] arxiv.download(paper) #,prefer_source_tarfile=True) pdffile = glob.glob("{}*.pdf".format(aid))[0] except: try: section = links[0].split('/')[-2].split(':')[-1] url = "https://arxiv.org/pdf/{}/{}.pdf".format( section, aid) pdffile = "{}.pdf".format(aid) urllib.request.urlretrieve(url, pdffile) except: import pdb pdb.set_trace() continue # parse pdf
''' import arxiv import json predate = '2018-12-19' _date = '2018-12-20' # 14853 2018-12-13 with open('log.json', 'r') as f: data = json.load(f) _start = data['log'][predate]['start'] + data['log'][predate]['cnt'] cnt = 0 paper = arxiv.query(search_query='cat:cs.CV', start=int(_start), max_results=50) print(type(paper), len(paper), paper) md = '# Latest CV paper updated in ' + _date for item in paper: if item['updated'][:10] == _date: cnt += 1 downurl = item['id'] title = item['title_detail']['value'].replace('\n', ' ') author = ','.join(item['authors']) summary = item['summary'].replace('\n', ' ') md += '\n' md += '\n' md += '#### {_order}. {_title}'.format(_order=cnt, _title=title)
import arxiv
import json
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

# Load the search parameters.
with open('./params.json') as json_file:
    params = json.load(json_file)

# Build the query string: every inner list of params["stage1"] becomes a
# parenthesised OR group of quoted terms, and the groups are joined with AND.
search_string = ' AND '.join(
    '( ' + ' OR '.join([f'"{item}"' for item in ors]) + ' )'
    for ors in params["stage1"])
print(search_string)

# Multi-field queries
result = arxiv.query(search_query=search_string, max_results=100)
print(result[0])

# Convert every result into a BibTeX-style entry dict.
biblist = []
for item in result:
    bibitem = {}
    bibitem['ENTRYTYPE'] = 'article'
    bibitem['ID'] = item['id']
    bibitem['abstract'] = item['summary']
    bibitem['title'] = item['title']
    bibitem['journal'] = 'arxiv'
    # Each name is split at its last space and reversed, so "First Last"
    # becomes "Last, First"; the names are then joined with " and ".
    bibitem['author'] = ' and '.join(
        [', '.join(author.rsplit(' ', 1)[::-1]) for author in item['authors']])
    bibitem['year'] = str(item['published_parsed'].tm_year)
    bibitem['month'] = str(item['published_parsed'].tm_mon)
    bibitem['url'] = item['pdf_url']
    biblist.append(bibitem)

db = BibDatabase()
def main(url, lang, small, path): if not (lang in ['JA', 'RU', 'PL', 'NL', 'IT', 'PT', 'ES', 'FR', 'DE']): print('Error: NOT SUPPORT LANGUAGE', file=sys.stderr) sys.exit(1) else: print('Mode: {}'.format(lang)) if url != 'None': arXiv_id = url[22:] result_list = arxiv.query(id_list=[arXiv_id], max_results=1) if len(result_list) < 1: print('Error: NOT FOUND PAPER', file=sys.stderr) sys.exit(1) else: print('Done: Found paper') result = result_list[0] # Prepare paper summary Summary = {} Summary["title"] = result.title.replace("\n", " ") Summary["author"] = result.author Summary["arxiv_url"] = result.arxiv_url Summary["pdf_url"] = result.pdf_url Summary["date"] = result.updated Summary["abstract"] = result.summary.replace("-\n", "").replace("\n", " ").replace(". ", ".\n") Summary_JP = {} Summary_JP["title"] = tr.traslateBydeepL(result.title.replace("\n", " "), lang) Summary_JP["author"] = result.author Summary_JP["arxiv_url"] = result.arxiv_url Summary_JP["pdf_url"] = result.pdf_url Summary_JP["date"] = result.updated Summary_JP["abstract"] = tr.traslateBydeepL(result.summary.replace("-\n", "").replace("\n", " ").replace(". 
", ".\n"), lang) # Download PDF def PDFdownload(url, title): urllib.request.urlretrieve(url, "{0}".format(title)) now_path = os.path.dirname(os.path.abspath(__file__)) data_path = now_path + '/data/{}'.format(arXiv_id.replace("/", "").replace(".", "")) os.makedirs(data_path, exist_ok=True) pdf_url = result.pdf_url pdf_title = ''.join([data_path, '/', arXiv_id.replace("/", "").replace(".", ""), '.pdf']) PDFdownload(pdf_url, pdf_title) # Correct documents in PDF # Chapter Class class Chapter: def __init__(self, title, pagenum): self.title = title self.body = '' self.pagenum = pagenum return def getTitle(self): return self.title def getPagenum(self): return self.pagenum def addBody(self, text): self.body = ''.join([self.body, text]) def getBody(self): return self.body # Layout Analysisのパラメーターを設定。縦書きの検出を有効にする。 laparams = LAParams(detect_vertical=True) # 共有のリソースを管理するリソースマネージャーを作成。 resource_manager = PDFResourceManager() # ページを集めるPageAggregatorオブジェクトを作成。 device = PDFPageAggregator(resource_manager, laparams=laparams) # Interpreterオブジェクトを作成。 interpreter = PDFPageInterpreter(resource_manager, device) Chapters = [] nowC = Chapter('Abstract', 1) print() print('-' * 30) # 読みやすいよう区切り線を表示する。 print('CHAPTER LIST') print('-' * 30) # 読みやすいよう区切り線を表示する。 with open(pdf_title, 'rb') as f: # PDFPage.get_pages()にファイルオブジェクトを指定して、PDFPageオブジェクトを順に取得する。 # 時間がかかるファイルは、キーワード引数pagenosで処理するページ番号(0始まり)のリストを指定するとよい。 for page_num, page in enumerate(PDFPage.get_pages(f)): interpreter.process_page(page) # ページを処理する。 layout = device.get_result() # LTPageオブジェクトを取得。 # ページ内のテキストボックスのリストを取得する。 boxes = pr.find_textboxes_recursively(layout) # テキストボックスの左上の座標の順でテキストボックスをソートする。 # y1(Y座標の値)は上に行くほど大きくなるので、正負を反転させている。 # boxes.sort(key=lambda b: (b.x0, -b.y1)) emp = " " piriod = ". " chainp = ".. 
" for box in boxes: pdf_text = box.get_text() # 整形処理 # タイトル入っちゃってるやつ消す pdf_text = pdf_text.replace(Summary["title"] + '\n', "") # 単語の途中で改行対処(普通の単語に直す) pdf_text = pdf_text.replace("-\n", "") # 単語の間や文末に改行消去 pdf_text = pdf_text.replace("\n", " ") # figのピリオド対処 pdf_text = pdf_text.replace("fig.", "fig").replace("Fig.", "Fig") # 3以上の長さのスペース入り連続ピリオドを圧縮 for i in range(3, 7): pdf_text = pdf_text.replace(piriod * i, "...") pdf_text = pdf_text.replace(chainp, "..") # タイトルっぽいやつのピリオドスペースは消しとく if nowC.getTitle() != 'References' and nowC.getTitle() != 'REFERENCES': for i in range(1, 10): pdf_text = re.sub('{}\. '.format(i), '{} '.format(i), pdf_text) # ピリオドスペースで改行 pdf_text = pdf_text.replace(". ", ".\n") # どんな長さのスペースも1つのスペースに for i in range(1, 7): pdf_text = pdf_text.replace(emp * i, " ") if re.findall( '^§?[A-Z]\.? [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[A-Z]\.[1-9]+\.? [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[A-Z]\.[1-9]\.[1-9]+\.? [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+\.[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+\.[1-9]+\.[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^Acknowledgements*|^Acknowledgments*|^ACKNOWLEDGMENTS*|^References*|^REFERENCES*|^Introduction*|^INTRODUCTION*', pdf_text) and len(pdf_text) > 8: # Subtitleの処理 print(pdf_text) Chapters.append(nowC) nowC = Chapter(re.findall( '^§?[A-Z]\.? [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[A-Z]\.[1-9]+\.? [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[A-Z]\.[1-9]\.[1-9]+\.? 
[^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+\.[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^§?[1-9]+\.[1-9]+\.[1-9]+ [^\+−≤≥<>∧∨==∈×;∇/:]+$|^Acknowledgements*|^Acknowledgments*|^ACKNOWLEDGMENTS*|^References*|^REFERENCES*|^Introduction*|^INTRODUCTION*', pdf_text)[0], page_num + 1) elif re.findall('^arXiv:*', pdf_text) and nowC.getTitle() != 'References' and nowC.getTitle() != 'REFERENCES' and nowC.getTitle() != 'References ': # arxiv_infoの処理 arxiv_info = re.findall('^arXiv:*', pdf_text) else: if re.findall('^References*|^REFERENCES*', nowC.getTitle()): # 参考文献の処理 pdf_text = pdf_text.replace("\n", " ") # .で終わってたら改行する if len(pdf_text) > 2 and pdf_text[-2] == '.': pdf_text += ('\n') else: if len(pdf_text) < 10: # 短すぎるやつは多分数式とかなので改行を消す pdf_text.replace("\n", " ") nowC.addBody(pdf_text) Chapters.append(nowC) # 翻訳概要表示 print('-' * 30) # 読みやすいよう区切り線を表示する。 print('日本語概要') for k, v in Summary_JP.items(): print('-' * 30) # 読みやすいよう区切り線を表示する。 if k == 'abstract': print('[概要]') print(v) else: print('{} : {}'.format(k, v)) # 導入と結論だけ早めに出しちゃう translated_text_conclusion = '' translated_text_introduction = '' for c in Chapters: if re.findall('Conclusion*|CONCLUSION*', c.getTitle()): print('-' * 30) # 読みやすいよう区切り線を表示する。 print('[結論]') transtext = "" transtext_list = c.getBody().split(sep='\n') ind = 0 while ind < len(transtext_list): while ind < len(transtext_list) and len(transtext) < 3000: transtext = ''.join([transtext, transtext_list[ind], '\n']) ind += 1 translated_text_conclusion = ''.join([translated_text_conclusion, tr.traslateBydeepL(transtext, lang)]) transtext = '' print(translated_text_conclusion) elif re.findall('Introduction*|INTRODUCTION*', c.getTitle()): print('-' * 30) # 読みやすいよう区切り線を表示する。 print('[導入]') transtext = "" transtext_list = c.getBody().split(sep='\n') ind = 0 while ind < len(transtext_list): while ind < len(transtext_list) and len(transtext) < 3000: transtext = ''.join([transtext, transtext_list[ind], '\n']) ind += 1 translated_text_introduction = 
''.join([translated_text_introduction, tr.traslateBydeepL(transtext, lang)]) transtext = '' print(translated_text_introduction) # smallだったらここで打ち切り if small: print('-' * 30) # 読みやすいよう区切り線を表示する。 print('Thank you !') return # 本文翻訳 print('-' * 30) # 読みやすいよう区切り線を表示する。 Chapters_JP = [] nowC = Chapter('Abstract', 1) for c in Chapters: nowC = Chapter(tr.traslateBydeepL(c.getTitle(), lang), c.getPagenum()) transtext = "" translated_text = "" transtext_list = c.getBody().split(sep='\n') ind = 0 if c.getTitle() != 'References' and c.getTitle() != 'REFERENCES': if re.findall('Conclusion*|CONCLUSION*', c.getTitle()): nowC.addBody(translated_text_conclusion) elif re.findall('Introduction*|INTRODUCTION*', c.getTitle()): nowC.addBody(translated_text_introduction) else: while ind < len(transtext_list): while ind < len(transtext_list) and len(transtext) < 3000: transtext = ''.join([transtext, transtext_list[ind], '\n']) ind += 1 translated_text = ''.join([translated_text, tr.traslateBydeepL(transtext, lang)]) transtext = '' nowC.addBody(translated_text) else: nowC.addBody(c.getBody()) Chapters_JP.append(nowC) print('Done Translating : Chapter {}'.format(c.getTitle())) print('-' * 30) # 読みやすいよう区切り線を表示する。 # 出力作業 # 出力用のマークダウンファイル en_md = open('{}/output_{}_EN.md'.format(data_path, arXiv_id.replace("/", "").replace(".", "")), 'w') jp_md = open('{}/output_{}_JP.md'.format(data_path, arXiv_id.replace("/", "").replace(".", "")), 'w') # 目次マーカー付与 en_md.write('[TOC]\n\n') jp_md.write('[TOC]\n\n') # 英語概要 for k, v in Summary.items(): for i in range(1, 30): v = re.sub('\[{}\]'.format(i), '\[\^{}\]'.format(i), v) if k == "arxiv_url" or k == "pdf_url": if k == "arxiv_url": en_md.write(''.join(['## ', 'URL'])) en_md.write('\n') else: en_md.write('\n') en_md.write(''.join([k, ' : ', '[', v, ']', '(', v, ')'])) else: if k == "title": en_md.write(''.join(['# ', v])) else: en_md.write(''.join(['## ', k])) en_md.write('\n') en_md.write(v.replace("\n", "<br>")) en_md.write('\n') # 訳語概要 for k, v in 
Summary_JP.items(): for i in range(1, 30): v = re.sub('\[{}\]'.format(i), '\[\^{}\]'.format(i), v) if k == "arxiv_url" or k == "pdf_url": if k == "arxiv_url": jp_md.write(''.join(['## ', 'URL'])) jp_md.write('\n') else: jp_md.write('\n') jp_md.write(''.join([k, ' : ', '[', v, ']', '(', v, ')'])) else: if k == "title": jp_md.write(''.join(['# ', v])) else: jp_md.write(''.join(['## ', k])) jp_md.write('\n') jp_md.write(v.replace("\n", "<br>")) jp_md.write('\n') # 英語本文 for c in Chapters[1:]: if c.title[1] == '.': en_md.write(''.join(['### ', c.title, ' --------- P.{}'.format(c.pagenum)])) else: en_md.write(''.join(['## ', c.title, ' --------- P.{}'.format(c.pagenum)])) en_md.write('\n') if c.title != 'References' and c.title != 'References ' and c.title != 'REFERENCES': for i in range(1, 30): c.body = re.sub('\[{}\]'.format(i), '[^{}]'.format(i), c.body) else: for i in range(1, 30): c.body = re.sub('\[{}\]'.format(i), '[^{}]:'.format(i), c.body) en_md.write(c.body.replace("\n", "<br>")) en_md.write('\n') # 訳語本文 for c in Chapters_JP[1:]: if len(c.title) > 1 and c.title[1] == '.': jp_md.write(''.join(['### ', c.title, ' --------- P.{}'.format(c.pagenum)])) else: jp_md.write(''.join(['## ', c.title, ' --------- P.{}'.format(c.pagenum)])) jp_md.write('\n') if c.title != '参考文献': for i in range(1, 30): c.body = re.sub('\[{}\]'.format(i), '[^{}]'.format(i), c.body) else: for i in range(1, 30): c.body = re.sub('\[{}\]'.format(i), '[^{}]:'.format(i), c.body) jp_md.write(c.body.replace("\n", "<br>")) jp_md.write('\n') en_md.close() jp_md.close() # Output HTML from MarkDown import markdown md = markdown.Markdown(extensions=['admonition', 'toc', 'footnotes']) with open('{}/output_{}_EN.md'.format(data_path, arXiv_id.replace("/", "").replace(".", ""))) as fen: with open('{}/output_{}_EN.html'.format(data_path, arXiv_id.replace("/", "").replace(".", "")), 'w') as hen: md_en = fen.read() body = md.convert(md_en) # HTML書式に合わせる html = '<html lang="ja"><meta charset="utf-8"><body>' 
html += (body + '</body></html>') hen.write(html) with open('{}/output_{}_JP.md'.format(data_path, arXiv_id.replace("/", "").replace(".", ""))) as fjp: with open('{}/output_{}_JP.html'.format(data_path, arXiv_id.replace("/", "").replace(".", "")), 'w') as hjp: md_jp = fjp.read() body = md.convert(md_jp) # HTML書式に合わせる html = '<html lang="ja"><meta charset="utf-8"><body>' html += (body + '</body></html>') hjp.write(html) # htmlファイルをブラウザ表示 import webbrowser jp_url = 'file://{}/output_{}_JP.html'.format( data_path, arXiv_id.replace("/", "").replace(".", "")) en_url = 'file://{}/output_{}_EN.html'.format( data_path, arXiv_id.replace("/", "").replace(".", "")) webbrowser.open_new(jp_url) webbrowser.open_new(en_url) print('-' * 30) # 読みやすいよう区切り線を表示する。 print('Thank you !') elif path != 'None': ep.translate(path, small, lang) else: print('Please use -u option or -p option.\ndetail : https://pypi.org/project/eigoyurusan/')
def download_arxiv(url, output):
    """Look up the arXiv paper referenced by *url* and download it.

    Args:
        url: An arXiv URL (``.../abs/<id>`` or ``.../pdf/<id>[.pdf]``) or a
            bare identifier.
        output: Value returned by the ``slugify`` callback handed to
            ``arxiv.download`` (presumably the target file name -- confirm
            against the installed ``arxiv`` version).

    Returns:
        The first query result for the identifier, or ``None`` when no
        identifier can be extracted or the query returns nothing.
    """
    arxiv_id = url.strip().rstrip('/')
    # Old-style identifiers ("hep-th/9901001") contain a slash, so take
    # everything after the /abs/ or /pdf/ marker instead of the last segment.
    for marker in ('/abs/', '/pdf/'):
        if marker in arxiv_id:
            arxiv_id = arxiv_id.split(marker, 1)[1]
            break
    else:
        arxiv_id = arxiv_id.split('/')[-1]
    # PDF links may carry a ".pdf" suffix that is not part of the identifier.
    if arxiv_id.endswith('.pdf'):
        arxiv_id = arxiv_id[:-len('.pdf')]
    if not arxiv_id:
        return None

    papers = arxiv.query(id_list=[arxiv_id])
    if len(papers) == 0:
        return None
    paper = papers[0]
    # The callback ignores its argument and always yields ``output``.
    arxiv.download(paper, slugify=lambda _paper: output)
    return paper
import arxiv

# Get an iterator over query results: with iterative=True the call returns
# a callable that produces the papers when invoked.
result = arxiv.query(
    query="GAN",
    max_chunk_results=10,
    max_results=1,
    iterative=True,
)

# Print inside the loop: with max_results=1 this shows the same single paper
# as before, but an empty result set now prints nothing instead of raising
# NameError on an unbound loop variable.
for paper in result():
    print(paper['title'])
    print(paper['summary'])
    print(paper['arxiv_url'])
    # Presumably a dict-like record, so this lists its field names.
    for field in paper:
        print(field)
from datetime import timedelta  # needed for the date arithmetic below

print(rand_cat, category[rand_cat])

# Build the category expression. The leading space before "OR" matters:
# without it the terms fuse together ("cs.CVOR cs.X").
QUERY = "cs.AI OR cs.CV"
for cat in category:
    QUERY += " OR cs.{}".format(cat)
# NOTE(review): the expression is interpolated after a single "cat:" prefix
# below and is not parenthesised -- confirm the arXiv API groups the OR chain
# and the date filter the way this script intends.

# Date two days ago as YYYYMMDD. Real date arithmetic is required here;
# subtracting 2 from the integer form yields invalid dates at month start.
dt = (datetime.now() - timedelta(days=2)).strftime("%Y%m%d")
# dt = "20201130"  # debugging override (four-digit year)
print(dt)

result_list = arxiv.query(
    query='cat:{} AND submittedDate:[{}000001 TO {}235959]'.format(QUERY, dt, dt),
    max_results=50,
    sort_by='submittedDate',
)


def translate(text, max_retries=10):
    """Translate English *text* to Japanese, retrying on failure.

    Args:
        text: English source text.
        max_retries: Maximum number of attempts (at least one is made).

    Returns:
        The ``.text`` attribute of the translation result.

    Raises:
        Exception: Whatever the final attempt raised, once every attempt
            has failed.
    """
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        # A fresh client per attempt, as the original did after each failure.
        tr = Translator(service_urls=['translate.googleapis.com'])
        try:
            return tr.translate(text, src="en", dest="ja").text
        except Exception:
            # Bounded retry: give up instead of spinning forever.
            if attempt == attempts - 1:
                raise


def translate_post(df):
    # NOTE(review): `title` and `abst` are not defined in the visible code and
    # `df` is unused -- this body looks truncated; confirm against the full file.
    title_jpn = translate(title)
    abst_jpn = translate(abst)
# url is arxiv url # id is arxiv id df = pd.read_csv("files_with_url.csv") return df
# NOTE(review): the line above is the tail of a definition whose header is
# not visible here; it is kept verbatim rather than guessed at.


def get_notes(df):
    """Collect the note file and arXiv record for every row of *df*.

    Args:
        df: DataFrame whose rows expose ``url``, ``File`` and ``id``.

    Returns:
        A list of dicts with keys ``url``, ``pdf_url``, ``arxiv_id``,
        ``file_contents`` and ``arxiv_obj``.

    Raises:
        IndexError: If the arXiv query for an id returns no result.
    """
    notes = []
    for _, row in df.iterrows():
        url = row.url
        note_file = row.File
        arxiv_id = row.id
        with open(f"{NOTES_DIR}{note_file}") as file_:
            file_contents = file_.read()
        print(note_file, arxiv_id)
        arxiv_obj = arxiv.query(id_list=[str(arxiv_id)])[0]
        # Derive the PDF link from the abstract link.
        pdf_url = url.replace("abs", "pdf") + ".pdf"
        notes.append(dict(url=url, pdf_url=pdf_url, arxiv_id=arxiv_id,
                          file_contents=file_contents, arxiv_obj=arxiv_obj))
    return notes


def complete_notes(notes):
    """Fill each note dict in place with fields derived from its inputs.

    Reads ``arxiv_obj`` and ``file_contents`` and adds ``authors``,
    ``published`` (first seven characters), ``abs``, ``notes``, ``tags``
    and ``title``.

    Args:
        notes: List of dicts as produced by ``get_notes``.

    Returns:
        The same list, with every dict updated.
    """
    for note in notes:
        paper = note["arxiv_obj"]
        # Blank-line separated paragraphs: the first is skipped, the last
        # holds the tags, everything in between is the notes.
        paragraphs = note["file_contents"].split("\n\n")

        note["authors"] = paper['authors']
        note["published"] = paper["published"][:7]
        note["abs"] = paper["summary_detail"]["value"].replace("\n", " ")
        note["notes"] = paragraphs[1:-1]

        tag_line = paragraphs[-1].replace("#", "").replace("\n", "").replace("_", " ")
        note["tags"] = [tag.capitalize() for tag in tag_line.split(", ")]

        # Collapse line breaks and runs of whitespace into single spaces.
        note["title"] = " ".join(paper['title'].split())
    return notes
# conn_matrix[i,:] = temp
#
# # fig = plt.figure(figsize=(10,5));
# # plt.spy(conn_matrix, marker ='s', color='chartreuse', markersize=5);
# # plt.xlabel('Authors');
# # plt.ylabel('Articles');
# # plt.title('Authors of the articles', fontweight='bold');
#
# return conn_matrix

# Command line: <search query> <max results>. Validate up front so a missing
# or malformed argument yields a usage message instead of a bare traceback.
if len(sys.argv) < 3:
    sys.exit("usage: {} SEARCH_QUERY MAX_RESULTS".format(sys.argv[0]))
search_query = sys.argv[1]
try:
    max_results = int(sys.argv[2])
except ValueError:
    sys.exit("MAX_RESULTS must be an integer, got {!r}".format(sys.argv[2]))

# Newest submissions first.
results = arxiv.query(search_query=search_query,
                      start=0,
                      max_results=max_results,
                      sort_by="submittedDate",
                      sort_order="descending")

# Split the results into per-field collections, then tally and plot them.
title, authors, date, summary, tags, pdf_url = store_results(results)
cat_counts = categorize_tags(tags)
author_counts = categorize_authors(authors)
cat_histogram(cat_counts, search_query, max_results)
author_histogram(author_counts, search_query, max_results)
unique_authors = get_unique_authors(authors)
import arxiv

# Query arXiv for up to 1000 papers on the topic and list what was found.
results = arxiv.query(query="learning analytics education", max_results=1000)
print("Found  {}  papers.".format(len(results)))

for paper in results:
    # Only the title and publication date are printed; the actual download
    # call is left disabled.
    print("Downloading:  {}   {}".format(paper['title'], paper['published']))
    #arxiv.download(paper)
from html import escape as html_escape  # escapes user input before it is echoed into HTML

# Arguments come from the CGI request when run under a web server, otherwise
# from key=value pairs on the command line.
if 'GATEWAY_INTERFACE' in os.environ:
    args = cgi_to_dict()
else:
    # Split on the first '=' only, so values may themselves contain '='.
    args = dict(arg.split('=', 1) for arg in sys.argv[1:])

#
## Main
#
start = int(args.get('start', 0))
length = int(args.get('length', 50))
query = args.get('query', QUERY)

# Newest submissions first, paged by start/length.
res = arxiv.query(query=query,
                  max_results=length,
                  sort_by="submittedDate",
                  sort_order="descending",
                  start=start)

# `query` is user-controlled, so it is escaped before being placed in markup.
print(f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>ArXiv search results</title>
<link rel="stylesheet" href="https://arxiv.org/css/arXiv.css">
</head>
<body class="with-cu-identity">
<div id="header">
<h1>arXiv.org – {html_escape(query)} – start={start}</h1>
</div>""")
#!/usr/bin/env python
import pprint
#import requests
import arxiv
import pandas as pd

# Look up papers by author and show the main fields of the first hit.
papers = arxiv.query(query='au:"Grisha Perelman"')
#print(type(papers))
#print(type(papers[0]))
#pprint.pprint(papers[0], width=200)

# Fail with a clear message rather than an IndexError when nothing matches.
if not papers:
    raise SystemExit('No results found for au:"Grisha Perelman"')

first = papers[0]
for field in ('author', 'title', 'arxiv_url', 'pdf_url', 'summary'):
    print("\n" + field + ":\n" + first[field])

#response = requests.post("http://localhost:3000/paper/create/")
#print(response.status_code)
#print(response.text)
import arxiv
import pprint

# Search arXiv for 'residual' and print each hit's title with its primary
# category term.
results = arxiv.query('residual')
pp = pprint.PrettyPrinter(indent=1)

for i, item in enumerate(results):
    title = item['title']
    primary = item['arxiv_primary_category']
    category = primary['term']
    # These fields are read but not printed.
    summary, published, updated = (
        item['summary'],
        item['published'],
        item['updated'],
    )
    print(title, category)
def arxiv_random(message):
    """Reply to *message* with a randomly chosen, recently listed arXiv math paper.

    Asks the arXiv OAI endpoint for identifiers in the ``math`` set since the
    computed date, picks one at random, fetches its metadata via
    ``arxiv.query`` and replies with an HTML summary plus a PDF link and size.

    Args:
        message: The incoming bot message to reply to.

    Returns:
        None. Any exception is caught and written to the action log.
    """
    user_action_log(message, "made arxiv random query")
    try:
        eastern = pytz.timezone('US/Eastern')
        eastern_time = datetime.datetime.now(eastern)
        # Publications appear at 20:00; before that, use the previous day.
        if eastern_time.hour < 20:
            eastern_time -= datetime.timedelta(days=1)
        # No publications on Friday and Saturday: fall back to Thursday.
        if eastern_time.weekday() == 5:
            eastern_time -= datetime.timedelta(days=2)
        elif eastern_time.weekday() == 4:
            eastern_time -= datetime.timedelta(days=1)
        last_published_date = eastern_time.strftime("%Y-%m-%d")

        response = requests.get('http://export.arxiv.org/oai2',
                                params={'verb': 'ListIdentifiers',
                                        'set': 'math',
                                        'metadataPrefix': 'oai_dc',
                                        'from': last_published_date})
        action_log("Random arxiv paper since {}".format(last_published_date))

        # Everything went well.
        if response.status_code == 200:
            response_tree = ElementTree.fromstring(response.content)
            # The third child of the root is hardcoded as the identifier list.
            # Only entries with a child element can be used below; childless
            # ones (presumably a trailing resumption token -- confirm) are
            # skipped.
            headers = [entry for entry in response_tree[2] if len(entry)]
            if not headers:
                user_action_log(message, "arxiv random query failed: "
                                         "no papers since {}".format(last_published_date))
                return
            # randrange excludes the upper bound; randint(0, n) could pick n
            # and index one past the end.
            paper_index = random.randrange(len(headers))
            paper_arxiv_id = headers[paper_index][0].text.split(':')[-1]

            papep_obj = arxiv.query(id_list=[paper_arxiv_id])[0]
            paper_link = papep_obj['pdf_url'].replace('http://', 'https://') + '.pdf'
            paper_link_name = paper_link.split("/pdf/")[1]
            print(paper_link)
            print(paper_link_name)

            # Size in megabytes, taken from a HEAD request.
            req_pdf_size = requests.head(paper_link)
            pdf_size = round(int(req_pdf_size.headers["Content-Length"]) / 1024 / 1024, 2)

            a_name = papep_obj['authors'][0]
            if len(papep_obj['authors']) > 1:
                # The template below supplies the trailing period.
                a_name += ' et al'

            query_answer = '{}. <a href="{}">{}</a>. {}\n\n— <a href="{}">{}</a>, {} Мб\n'.format(
                escape(a_name),
                papep_obj['arxiv_url'],
                escape(papep_obj['title'].replace('\n', ' ')),
                escape(papep_obj['summary'].replace('\n', ' ')),
                paper_link,
                paper_link_name,
                pdf_size
            )
            my_bot.reply_to(message, query_answer, parse_mode="HTML", disable_web_page_preview=False)
            user_action_log(message,
                            "arxiv random query was successful: "
                            "got paper {}".format(papep_obj['arxiv_url']))
            # TODO(randl): doesn't send. Download and delete?
            # my_bot.send_document(message.chat.id, data=paper_link)
        elif response.status_code == 503:
            # We are querying too often.
            action_log("Too much queries. 10 minutes break should be enough")
            arxiv_checker.last_call = datetime.datetime.utcnow() - datetime.timedelta(seconds=610)
        else:
            # Anything else is treated as a failure.
            user_action_log(message, "arxiv random query failed: response {}".format(response.status_code))
    except Exception as ex:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        action_log("Unknown Exception: {}: {}\nat {} line {}".format(exc_type, ex, fname, exc_tb.tb_lineno))
import arxiv from py2neo import Graph client = MongoClient('mongodb', 27017) db = client['arxiv'] Papers = db["Papers"] Papers.drop() uri = "http://neo4j:7474" password = "******" graph = Graph(uri, password=password) graph.run("MATCH (n) DETACH DELETE n") ## insersion base for paper in arxiv.query(query="quantum", max_results=1000): Papers.insert_one(paper) cursor_paper = Papers.find() for paper in cursor_paper: create_paper = "CREATE (p:PAPER {id: '%s' })" % paper["id"] match_authors = "" link_authors = "" for i, author in enumerate(paper["authors"]): match_authors += "MERGE (u%s:AUTHOR {name:\"%s\"}) \n" % (i, author) link_authors += "MERGE (u%s)-[:AUTHORED {author_rank: %s}]->(p) \n" % ( i, i) match_tags = "" link_tags = "" for i, tag in enumerate(paper["tags"]):