def test_sleep_multiple_requests(self, patched_time_sleep):
    """Rate limiting applies even when consecutive requests hit different URLs.

    The first fetch must not sleep; a second fetch for a different query
    issued right afterwards must sleep for about ``client.delay_seconds``.
    """
    client = arxiv.Client(page_size=1)
    quantum_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)
    testing_url = client._format_url(arxiv.Search(query="testing"), 0, 1)

    # Rate limiting is URL-independent; expect same behavior as in
    # `test_sleep_standard`.
    client._parse_feed(quantum_url)
    patched_time_sleep.assert_not_called()

    # Pin the last-request timestamp to "now" before the second fetch.
    client._last_request_dt = datetime.now()
    client._parse_feed(testing_url)
    expected_delay = approx(client.delay_seconds, rel=1e-3)
    patched_time_sleep.assert_called_once_with(expected_delay)
def dbsearch_arxiv(
    data: list[dict],
    silent: bool = False,
) -> dict:
    """
    Check online databases (can be slow!).

    Two passes are made over ``data``:

    1.  Entries with a DOI but no arXiv id: arXiv is searched for the quoted
        DOI and the ids of all hits are collected under ``"arxivid"``.
    2.  Entries with an arXiv id: the arXiv record is fetched and its DOI is
        collected under ``"doi"``, unless it is missing or already known.

    :param data: Entries to check. Each entry is passed to ``get_identifiers``
        and must have an ``"ID"`` key, which is used as key in the output.
    :param silent: Hide status bar.
    :return: Dictionary with discovered items:
        ``{entry_id: {"arxivid": [...], "doi": [...]}}``.
        Only entries (and fields) for which something was found are present.
    """

    output = defaultdict(lambda: defaultdict(list))

    # find arxivid based on journal doi
    for entry in tqdm.tqdm(data, disable=silent):
        iden = get_identifiers(entry)
        if "arxivid" in iden or "doi" not in iden:
            continue
        doi = iden["doi"]
        found = [
            re.sub(r"(http)(s?)(://arxiv.org/abs/)(.*)", r"\4", result.entry_id)
            for result in arxiv.Search(query=f'"{doi}"').results()
        ]
        for arxivid in found:
            output[entry["ID"]]["arxivid"].append(arxivid)

    # arXiv preprint: check if journal id is present
    # possible optimisation: can be made faster to bluntly skip all entries that have a doi
    for entry in tqdm.tqdm(data, disable=silent):
        iden = get_identifiers(entry)
        if "arxivid" not in iden:
            continue
        # A preprint without a journal DOI reports no DOI at all; skip those
        # so that the output never lists ``None`` as a discovered DOI.
        found = [
            result.doi
            for result in arxiv.Search(id_list=[iden["arxivid"]]).results()
            if result.doi
        ]
        if "doi" in iden:
            found = [doi for doi in found if doi != iden["doi"]]
        for doi in found:
            output[entry["ID"]]["doi"].append(doi)

    # convert the nested defaultdicts to plain dictionaries
    return {key: dict(fields) for key, fields in output.items()}
def _get_queries_from_last_day(self, max_results=100):
    """Collect recent, keyword-matching arXiv entries that were not sent before.

    For every category in ``self._categories`` the newest submissions are
    fetched and wrapped in ``Query``; only those flagged ``is_recent`` are
    kept. The combined list is de-duplicated by ``id``, filtered to entries
    whose string form contains at least one of ``self._keywords``
    (case-insensitive), sorted newest first, and stripped of entries that
    were sent previously. The returned entries are recorded as sent.

    :param max_results: maximum number of results requested per category.
    :return: list of ``Query`` objects, most recent first. Empty when
        ``self._keywords`` is empty, since nothing can match.
    """
    queries = []

    # get all queries in the categories in the last day
    for category in self._categories:
        search = arxiv.Search(
            query=category,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            max_results=max_results,
        )
        queries += [q for q in map(Query, search.get()) if q.is_recent]

    # get rid of duplicates (the last occurrence of an id wins)
    queries = list({q.id: q for q in queries}.values())

    # only keep queries that contain keywords; `any` (unlike `max` on a list)
    # also copes with an empty keyword list instead of raising ValueError
    keywords = [k.lower() for k in self._keywords]
    matching = []
    for q in queries:
        text = str(q).lower()
        if any(k in text for k in keywords):
            matching.append(q)

    # sort from most recent to least
    matching.sort(key=lambda q: q.date, reverse=True)

    # filter if previously sent
    prev_arxivs = self._get_previously_sent_arxivs()
    queries = [q for q in matching if q.id not in prev_arxivs]
    self._save_previously_sent_arxivs(queries)
    return queries
def test_get_short_id(self):
    """``get_short_id`` returns the requested id followed by a version suffix."""
    result_id = "1707.08567"
    result = next(arxiv.Search(id_list=[result_id]).get())
    got = result.get_short_id()
    self.assertTrue(got.startswith(result_id))
    # Should be of form `1707.08567v1`. The id is escaped so that its "."
    # only matches a literal dot rather than any character.
    self.assertRegex(got, r'^{}v\d+$'.format(re.escape(result_id)))
def test_query_page_count(self):
    """Fetching 55 results with a page size of 10 takes six feed requests."""
    client = arxiv.Client(page_size=10, delay_seconds=0)
    # Wrap (rather than replace) the feed parser so that calls are counted
    # while the wrapped parser still runs.
    client._parse_feed = MagicMock(wraps=client._parse_feed)
    search = arxiv.Search(query="testing", max_results=55)
    fetched = list(client.get(search))
    self.assertEqual(len(fetched), 55)
    self.assertEqual(client._parse_feed.call_count, 6)
def is_valid_arxiv_id(arxiv_id: str) -> bool:
    """Return whether an arXiv lookup for ``arxiv_id`` yields a result.

    :param arxiv_id: identifier to look up.
    :return: ``True`` if the search produces at least one result; ``False``
        if it produces none or the lookup raises an error.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    try:
        next(search.results())
    except Exception:
        # Covers StopIteration (no result) and errors raised during the
        # lookup. KeyboardInterrupt / SystemExit are deliberately not caught.
        return False
    return True
def test_result_shape(self):
    """A search returns exactly ``max_results`` results, each of them valid."""
    expected_count = 100
    search = arxiv.Search("testing", max_results=expected_count)
    fetched = list(search.get())
    self.assertEqual(len(fetched), expected_count)
    for item in fetched:
        self.assert_valid_result(item)
def run(self):
    """Build a one-paragraph citation for an arXiv paper.

    ``self.arguments[0]`` is used as the citation label and
    ``self.arguments[1]`` as the arXiv id to look up.

    The paragraph reads ``[label] authors, <title>[, journal ref][, doi: ...]
    (open)``, where ``(open)`` links to the paper's PDF.

    :return: a list holding the paragraph node, or an empty list when the
        paper cannot be retrieved.
    """
    # search arXiv database
    try:
        search = arxiv.Search(id_list=[self.arguments[1]])
        paper = next(search.results())
    except Exception:
        # best effort: render nothing if the lookup fails or finds nothing
        return []

    # Journal reference and DOI are both optional. Prefix each present part
    # with ", " so the text attaches cleanly to the title whichever of the
    # two is available (no dangling or missing separator).
    details = []
    if paper.journal_ref:
        details.append(paper.journal_ref)
    if paper.doi:
        details.append(f"doi: {paper.doi}")
    journal = "".join(f", {item}" for item in details)

    # generate journal link nodes
    ret_node = nodes.paragraph()
    ret_node += nodes.Text(f"[{self.arguments[0]}] ")
    ret_node += nodes.Text(", ".join([author.name for author in paper.authors]) + ", ")
    ret_node += nodes.emphasis(text=f"{paper.title}")
    if journal:
        ret_node += nodes.Text(journal)
    ret_node += nodes.Text(" ")
    ret_node += nodes.reference(text="(open)", refuri=paper.pdf_url)

    return [ret_node]
def get_accurate_name_from_arxiv(paper_title: str):
    """Return the title that arXiv records for the paper named ``paper_title``.

    :param paper_title: (approximate) title of the paper.
    :return: the title of the matching arXiv entry.
    :raises IndexError: if the web search yields no URL, or the URL contains
        no ``<digits>.<digits>`` identifier.
    :raises StopIteration: if arXiv has no entry for the extracted id.
    """
    # Querying arXiv by paper title is unreliable, so a web search restricted
    # to arxiv.org is used to find the arXiv id instead.
    arxiv_url = list(search(f'{paper_title} site:arxiv.org', stop=1))[0]
    arxiv_id = re.findall(r'\d+\.\d+', arxiv_url)[0]
    # `arxiv.Search` is not subscriptable and its results are objects, not
    # dicts: take the first result from the iterator and read the attribute.
    paper = next(arxiv.Search(id_list=[arxiv_id]).results())
    return paper.title
def gen_record(document_id, primary_doc, gen_links):
    """Generate record from arxiv url.

    The paper id is whatever follows the last ``abs/`` in ``document_id``
    (the whole string if ``abs/`` does not occur), e.g.
    ``https://arxiv.org/abs/1810.04805`` -> ``1810.04805``.

    arxiv reference: https://arxiv.org/help/api/user-manual#_calling_the_api
    api url example: http://export.arxiv.org/api/query?id_list=1311.5600

    ``gen_links`` is accepted but not used by this function.
    """
    _, _, paper_id = document_id.rpartition("abs/")
    result = next(arxiv.Search(id_list=[paper_id]).get())
    return gen_arxiv_record_from_result(result, primary_doc=primary_doc)
def find_in_arxiv(paper_url):
    """Look up year, authors and title of the arXiv paper behind ``paper_url``.

    The paper id is the last ``/``-separated component of the URL, without a
    trailing ``.pdf`` extension if there is one.

    :param paper_url: URL (or bare id) of the paper.
    :return: tuple ``(year, authors, title, paper_id)`` where ``authors`` is a
        list of ``{"name": ...}`` dictionaries.
    :raises StopIteration: if arXiv returns no result for the id.
    """
    paper_id = paper_url.split('/')[-1]
    # Only strip a real ".pdf" suffix; testing for "pdf" anywhere in the id
    # would chop four characters off ids that merely contain those letters.
    if paper_id.endswith('.pdf'):
        paper_id = paper_id[:-len('.pdf')]

    paper = next(arxiv.Search(id_list=[paper_id]).get())
    year = paper.published.year
    authors = [{"name": author.name} for author in paper.authors]
    title = paper.title
    return year, authors, title, paper_id
def search_arxiv(queries, max_results=100):
    '''
    Search arXiv for every query and collect the most recently submitted
    results of all searches in a single DataFrame.

    params:
        queries (List -> Str) : A list of strings containing keywords you want
                                to search on arXiv
        max_results (Int) : The maximum number of results fetched per query.
                            Default value is 100.

    returns:
        A DataFrame with one row per (query, result) pair and the columns
        `title`, `date`, `article_id`, `url`, `main_topic`, `all_topics`,
        `authors`, `year`. `article_id` is an integer assigned to each
        distinct arXiv entry in order of first appearance, so a paper found by
        several queries carries the same id in each of its rows. When nothing
        is found the DataFrame is empty but still has these columns.

    example:
        research_df = search_arxiv(
            queries = ['automl', 'recommender system', 'nlp', 'data science'],
            max_results = 10000
        )
    '''
    columns = [
        'title', 'date', 'article_id', 'url',
        'main_topic', 'all_topics', 'authors',
    ]
    rows = []

    # hitting the API, newest submissions first
    for query in queries:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending,
        )
        for res in search.results():
            rows.append({
                'title': res.title,
                'date': res.published,
                'article_id': res.entry_id,
                'url': res.pdf_url,
                'main_topic': res.primary_category,
                'all_topics': res.categories,
                'authors': res.authors,
            })

    # Passing the columns explicitly keeps the frame well-formed when there
    # are no rows; otherwise the 'date' lookup below raises KeyError.
    d = pd.DataFrame(rows, columns=columns)
    d['year'] = pd.DatetimeIndex(d['date']).year

    # change article id from url to integer
    unique_article_ids = d['article_id'].unique()
    article_mapping = {art: idx for idx, art in enumerate(unique_article_ids)}
    d['article_id'] = d['article_id'].map(article_mapping)
    return d
def test_sleep_standard(self, patched_time_sleep):
    """A client sleeps about ``delay_seconds`` between back-to-back requests."""
    client = arxiv.Client(page_size=1)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)

    # A client should sleep until delay_seconds have passed; the very first
    # request has nothing to wait for.
    client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()

    # Overwrite _last_request_dt to minimize flakiness: different
    # environments will have different page fetch times.
    client._last_request_dt = datetime.now()
    client._parse_feed(feed_url)
    expected_delay = approx(client.delay_seconds, rel=1e-3)
    patched_time_sleep.assert_called_once_with(expected_delay)
def dl_paper(self):
    """Download pdf paper with 'paper_id' in working directory.

    Sets ``self.paper_title`` from the arXiv entry, saves the pdf under
    ``self.paper_name`` and stores the text extracted from that file in
    ``self.paper_content``.
    """
    # look the paper up by id and take the first result
    paper = next(arxiv.Search(id_list=[self.paper_id]).get())
    self.paper_title = paper.title
    # fetch the pdf, then read its content back from the saved file
    paper.download_pdf(filename=self.paper_name)
    self.paper_content = extract_text(self.paper_name)
def test_sleep_elapsed(self, patched_time_sleep):
    """The client sleeps only while ``delay_seconds`` have not yet elapsed."""
    client = arxiv.Client(page_size=1)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)

    # If _last_request_dt is less than delay_seconds ago, sleep.
    client._last_request_dt = datetime.now() - timedelta(
        seconds=client.delay_seconds - 1
    )
    client._parse_feed(feed_url)
    patched_time_sleep.assert_called_once()

    patched_time_sleep.reset_mock()

    # If _last_request_dt is at least delay_seconds ago, don't sleep.
    client._last_request_dt = datetime.now() - timedelta(
        seconds=client.delay_seconds
    )
    client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()
def query_recent(category):
    """
    Query the arxiv for the updates of the last day for a given category

    At most 75 results are requested, sorted by last-updated date; collection
    stops at the first result whose update time is not today.
    """
    search = arxiv.Search(
        query=category,
        max_results=75,
        sort_by=arxiv.SortCriterion.LastUpdatedDate,
    )
    todays_updates = []
    for element in search.results():
        if not is_today(element.updated):
            break
        todays_updates.append(element)
    return todays_updates
def main():
    """Upload a paper with ``rmapi put``.

    The single command-line argument is either a path to a file or, when
    ``is_arxiv_code`` accepts it, an arXiv code; in the latter case the pdf is
    first downloaded into ``/tmp/`` under a slug-based file name.
    """
    parser = argparse.ArgumentParser(description='Paper to transfer to Remarkable')
    parser.add_argument('paper', type=str, help='ArVix code or path to file')
    args = parser.parse_args()

    target = args.paper
    if is_arxiv_code(target):
        matches = list(arxiv.Search(id_list=[target]).results())
        paper = matches[0]
        paper.download_pdf(dirpath="/tmp/", filename=to_slug(paper) + '.pdf')
        print("Adding " + paper.title + " to Remarkable")
        # upload the downloaded copy instead of the raw argument
        target = "/tmp/" + to_slug(paper) + ".pdf"

    call(["rmapi", "put", target])
def arxiv_query_info(arxiv_id_raw):
    """
    Returns extra information about the queried paper

    The message holds the arXiv id, the title, the list of author names and
    the abstract with its newlines replaced by spaces.
    """
    arxiv_id = url_to_id(arxiv_id_raw)
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(search.results())
    authors = list(map(str, paper.authors))
    abstract = paper.summary.replace("\n", " ")
    return f""" > {arxiv_id} Title: {paper.title} Authors: {authors} Abstract: {abstract} """
def parse_arxiv(command):
    """
    Hacky way to parse out an arxiv ID from a sentence

    Every ``ARXIV_REGEX`` match in ``command`` is printed; matches containing
    "arxiv" contribute the last path component, cut at ``.pdf``, as an id.
    Returns the list of arXiv results for those ids (empty if none found).
    """
    arxiv_ids = []
    for link in re.findall(ARXIV_REGEX, command):
        print(link)
        if 'arxiv' in link:
            tail = link.split('/')[-1]
            arxiv_ids.append(tail.split('.pdf')[0])

    if not arxiv_ids:
        return []
    return list(arxiv.Search(id_list=arxiv_ids).results())
def get_arxiv_id(paper_title: str, feel_lucky: bool = True):
    """Return the versioned arXiv id of the paper named ``paper_title``.

    :param paper_title: (approximate) title of the paper.
    :param feel_lucky: when ``False``, print the requested and the found
        title and ask for confirmation on stdin before accepting the match.
    :return: the last path component of the paper's pdf URL (for example
        ``1911.05722v3``), or ``None`` if the user answers ``n``.
    :raises IndexError: if the web search yields no URL, or the URL contains
        no ``<digits>.<digits>`` identifier.
    :raises StopIteration: if arXiv has no entry for the extracted id.
    """
    # Querying arXiv by paper title is unreliable, so a web search restricted
    # to arxiv.org is used to find the arXiv id instead.
    arxiv_url = list(search(f'{paper_title} site:arxiv.org', stop=1))[0]
    arxiv_id = re.findall(r'\d+\.\d+', arxiv_url)[0]
    # `arxiv.Search` is not subscriptable and its results are objects, not
    # dicts: take the first result from the iterator and read attributes.
    paper = next(arxiv.Search(id_list=[arxiv_id]).results())

    if not feel_lucky:
        print(paper_title)
        print(paper.title)
        if input('Should we continue') == 'n':
            return None

    # TODO assert paper_title matches paper.
    # for example http://arxiv.org/pdf/1911.05722v3 -> 1911.05722v3
    return paper.pdf_url.split('/')[-1]
def build_content(query, query_config):
    """Build one newsletter message per keyword.

    :param query: mapping with the keys ``'domains'`` and ``'keywords'``; one
        arXiv search is run per keyword, combined with the domains through
        ``build_query``.
    :param query_config: extra keyword arguments forwarded to ``arxiv.Search``.
    :return: list of ``(subject, content)`` tuples, one per keyword, in
        keyword order.
    """
    domains = query['domains']
    keywords = query['keywords']
    total_mail = len(keywords)

    subject_placeholder = 'arXiv newsletter ' + str(today) + ' {index}/' + str(
        total_mail)
    content_placeholder = '\n' + '*' * 35 + '\n ' + subject_placeholder + ' \n' + '*' * 35 + '\n'
    entry_placeholder = '{index}. {title}\n{authors}\nPublished at: {publish}\nUpdated at: {update}\nPrimary Category: {primary_category}\nCategories: {categories}\n{notes}\n{link}\n\nAbstract:\n{abstract}\n'

    messages = []
    for i, keyword in enumerate(keywords, start=1):
        search_query = build_query(domains, keyword)
        print(search_query)

        # Building the Search object only stores its arguments, so there is
        # nothing to retry here: an error means the arguments are wrong, and
        # retrying it in an endless loop would hang instead of reporting it.
        search = arxiv.Search(query=search_query, **query_config)

        entries = ''.join(
            entry_placeholder.format(
                index=j,
                title=result.title,
                authors=', '.join([author.name for author in result.authors]),
                publish=result.published,
                update=result.updated,
                primary_category=result.primary_category,
                categories=', '.join(result.categories),
                link='\n'.join([link.href for link in result.links]),
                abstract=result.summary,
                notes=f'Comments: {result.comment}\n'
                if result.comment is not None else '') + '\n'
            for j, result in enumerate(search.results(), start=1)
        )

        subject = subject_placeholder.format(index=i)
        content = content_placeholder.format(index=i)
        content += '\nQuery: ' + keyword + '\n\n' + entries
        messages.append((subject, content))
    return messages
def get_arxiv_papers(
    query: str,
    fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
):
    """
    Performs arxiv API request of a given query and returns list of papers with
    fields as desired.

    Each attribute of a result is renamed through `arxiv_field_mapper` (names
    without a mapping are kept), dropped unless the resulting name is listed
    in `fields`, and its value passed through the matching `process_fields`
    converter when there is one.

    Args:
        query (str): Query to arxiv API. Needs to match the arxiv API notation.
        fields (List[str]): List of strings with fields to keep in output.
        max_results (int): Maximal number of results, defaults to 99999.
        client_options (Dict): Optional arguments for `arxiv.Client`. E.g.:
            page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options (Dict): Optional arguments for `arxiv.Search`. E.g.:
            id_list (List), sort_by, or sort_order.

    Returns:
        list of dicts. One dict per paper.
    """
    client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query, max_results=max_results, **search_options)

    processed = []
    for paper in client.results(search):
        record = {}
        for key, value in vars(paper).items():
            name = arxiv_field_mapper.get(key, key)
            if name not in fields:
                continue
            # identity conversion for fields without a dedicated processor
            convert = process_fields.get(name, lambda x: x)
            record[name] = convert(value)
        processed.append(record)
    return processed
def find_download_pdf(pdf_name, data):
    """Given a name of a pdf, downloads the pdf

    The first row of ``data`` whose title contains ``pdf_name`` (as a literal
    substring) is used; the arXiv id is read from that row's second column.

    :param pdf_name: name of the pdf to download which contains to claim
    :type pdf_name: str
    :param data: arxiv dataset which contains the details of all pdfs and their authors, links etc
    :type data: pandas dataframe
    :raises IndexError: if no title in ``data`` contains ``pdf_name``
    :return: all the content/text found in the pdf
    :rtype: str
    """
    # SSL Certificate to download pdf from link
    # NOTE(security): this replaces the process-wide default HTTPS context
    # factory with an unverified one, i.e. certificate verification is turned
    # off for every later HTTPS request that relies on that default. Kept for
    # backward compatibility; review whether it is still required.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # Legacy Python that doesn't verify HTTPS certificates by default
        pass
    else:
        # Handle target environment that doesn't support HTTPS verification
        ssl._create_default_https_context = _create_unverified_https_context

    # Match the name literally (titles routinely contain regex
    # metacharacters such as parentheses, "+" or "?") and treat missing
    # titles as non-matching so that the boolean mask is always valid.
    title_matches = data['title'].str.contains(pdf_name, regex=False, na=False)
    candidates = data[title_matches]
    # presumably the second column holds the arXiv id; verify against caller
    arxiv_id = candidates.iloc[0, 1]

    paper = next(arxiv.Search(id_list=[arxiv_id]).get())
    pdf_path = paper.download_pdf()
    raw_text = textract.process(pdf_path, method='pdfminer')

    # Convert text type from bytes to string and drop form feed characters
    ref_text = raw_text.decode("utf-8")
    return ref_text.replace('\x0c', '')
def test_max_results(self):
    """``max_results`` caps the result count even below the page size."""
    client = arxiv.Client(page_size=10, delay_seconds=0)
    search = arxiv.Search(query="testing", max_results=2)
    fetched = list(client.get(search))
    self.assertEqual(len(fetched), 2)
def crawler(query, sort_by, sort_order, page_size, subjectcategory, max_results=float('inf')):
    """Crawl arXiv per subject/keyword and store new papers as markdown files.

    For every subject, each of its keywords is searched on arXiv. Results that
    belong to one of the requested categories and whose short id is not yet in
    the subject's id set are grouped by the year of their last update. Each
    year is written to ``<year>.md`` inside the subject's database directory
    (merged with the entries already stored in that file), and the updated id
    set is written to ``db.txt`` when anything new was found.

    Errors raised while fetching the results of one keyword are printed and
    the crawl continues with the next keyword.

    :param query: JSON string; decoded to a mapping of subject -> keywords.
    :param sort_by: key into ``sort_by_dict`` selecting the sort criterion.
    :param sort_order: key into ``sort_order_dict`` selecting the sort order.
    :param page_size: page size for the arXiv client (converted with ``int``).
    :param subjectcategory: JSON string; decoded to the collection of
        categories a result must belong to (at least one of them).
    :param max_results: maximum number of results per keyword; converted to
        ``int`` when given as a string.
    """
    # Argument handling
    query = json.loads(query)
    subjectcategory = json.loads(subjectcategory)
    max_results = int(max_results) if isinstance(max_results, str) else max_results

    # Client configuration: one API request every 5 seconds, 5 retries on error
    client = arxiv.Client(
        page_size=int(page_size),
        delay_seconds=5,
        num_retries=5
    )

    for subject, key_words in query.items():
        query_results = defaultdict(list)
        db_set, arxiv_db_path = load_set(subject)
        # One search request per keyword
        for key_word in key_words:
            search = arxiv.Search(
                query=key_word,
                max_results=max_results,
                sort_by=sort_by_dict[sort_by],
                sort_order=sort_order_dict[sort_order]
            )
            try:
                for result in client.get(search):
                    # Is the result in one of the requested categories?
                    # (for/else: skip the result when no category matched)
                    for cate in result.categories:
                        if cate in subjectcategory:
                            break
                    else:
                        continue
                    # Is it already present in the database?
                    short_id = result.get_short_id()
                    if short_id in db_set:
                        continue
                    db_set.add(short_id)
                    year = result.updated.tm_year
                    ori = dict()
                    ori['title'] = result.title
                    ori['authors'] = [author.name for author in result.authors]
                    # raw update time, used as sort key when writing the file
                    ori['updated_sorted'] = result.updated
                    # ori['published'] = time.strftime('%Y-%m-%d %H:%M:%S', result.published)
                    ori['updated'] = time.strftime('%Y-%m-%d %H:%M:%S', result.updated)
                    ori['summary'] = result.summary.replace('\n', ' ')
                    # ori['comment'] = result.comment
                    # ori['primary_category'] = result.primary_category
                    # ori['categories'] = result.categories
                    ori['pdf_url'] = result.get_pdf_url()
                    ori['short_id'] = result.get_short_id()
                    query_results[year].append(ori)
            except arxiv.UnexpectedEmptyPageError:
                print(f"{subject}--{key_word}: arxiv.UnexpectedEmptyPageError")
            except arxiv.HTTPError:
                print(f"{subject}--{key_word}: arxiv.HTTPError")
            except Exception as error:
                print(f"{subject}--{key_word}: {error}")

        # Parse and store the results
        for year, results in query_results.items():
            markdown_fp = os.path.join(arxiv_db_path, f'{year}.md')
            if os.path.exists(markdown_fp):
                # merge the new entries into those already stored for the year
                old_results = load_markdown(markdown_fp)
                query_set = set([item['short_id'] for item in old_results])
                for item in results:
                    if item['short_id'] not in query_set:
                        old_results.append(item)
                results = old_results
            results = sorted(results, key=lambda item: item['updated_sorted'])
            markdown = []
            markdown.append(f"# {year}\n")
            toc = []
            content = defaultdict(list)
            for result in results:
                # "YYYY-MM" part of the formatted update time
                ym = result['updated'].rsplit('-', 1)[0]
                if ym not in toc:
                    toc.append(ym)
                paper = f"<details>\n\n<summary>{result['updated']} - {result['title']}</summary>\n\n" \
                        f"- *{', '.join(result['authors'])}*\n\n" \
                        f"- `{result['short_id']}` - [abs](http://arxiv.org/abs/{result['short_id']}) - [pdf]({result['pdf_url']})\n\n" \
                        f"> {result['summary']}\n\n" \
                        f"</details>\n\n"
                content[ym].append(paper)
            markdown.append("## TOC\n")
            toc = sorted(toc)
            markdown.append("\n".join([f"- [{t}](#{t})" for t in toc])+'\n')
            for ym, papers in content.items():
                markdown.append(f"## {ym}\n")
                markdown.append("".join(papers))
            with open(markdown_fp, "w", encoding='utf-8') as f:
                f.write("\n".join(markdown))

        # persist the id set only when this subject produced new entries
        if len(query_results) > 0:
            with open(os.path.join(arxiv_db_path, 'db.txt'), "w") as f:
                db_str = json.dumps(list(db_set))
                f.write(db_str)
def set_metadata(filename: str, title: str, author: str): args = [ "exiftool", filename, "-overwrite_original", f"-Author={author}", f"-Title={title}", ] subprocess.run(args, capture_output=True, check=True) lines = sys.stdin.readlines() paper_ids = [parse_line(line.strip()) for line in lines] paper_ids = [x for x in paper_ids if x is not None] papers = arxiv.Search(id_list=paper_ids).results() for paper, paper_id in zip(papers, paper_ids): src_filename = f"{paper_id}.pdf" dst_filename = paper_to_filename(paper) if os.path.exists(src_filename): print(f"[Rename] {src_filename}") os.rename(src_filename, dst_filename) else: print("[Download]") paper.download_pdf(filename=dst_filename) print(f"file: {dst_filename}") print(f"url: {paper.entry_id}") print(f"authors: {[str(x) for x in paper.authors]}") print(f"title: {paper.title}\n") set_metadata(
def test_sleep_zero_delay(self, patched_time_sleep):
    """With ``delay_seconds=0`` consecutive requests never sleep."""
    client = arxiv.Client(page_size=1, delay_seconds=0)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)
    # two back-to-back requests for the same URL
    for _ in range(2):
        client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()
def test_invalid_id(self):
    """Searching for the id ``0000.0000`` yields no results."""
    search = arxiv.Search(id_list=["0000.0000"])
    fetched = list(search.get())
    self.assertEqual(len(fetched), 0)
def test_no_duplicates(self):
    """No entry id occurs twice within the first 100 results of a search."""
    seen_ids = set()
    for result in arxiv.Search("testing", max_results=100).get():
        entry_id = result.entry_id
        self.assertFalse(entry_id in seen_ids)
        seen_ids.add(entry_id)
def broken_get():
    """Return the first result of a "quantum" search run through ``broken_client``."""
    results = broken_client.get(arxiv.Search(query="quantum"))
    return next(results)