def test_from_feed_entry(self):
    """Build a ``Result`` from the first entry of a parsed feed and validate it."""
    parsed_feed = arxiv.Client()._parse_feed(
        "http://export.arxiv.org/api/query?search_query=testing"
    )
    first_entry = parsed_feed.entries[0]
    self.assert_valid_result(arxiv.Result._from_feed_entry(first_entry))
def test_query_page_count(self):
    """Requesting 55 results with a page size of 10 should parse six feeds."""
    client = arxiv.Client(page_size=10, delay_seconds=0)
    # Wrap the real parser so requests still go through while calls are counted.
    client._parse_feed = MagicMock(wraps=client._parse_feed)
    search = arxiv.Search(query="testing", max_results=55)
    fetched = list(client.get(search))
    self.assertEqual(len(fetched), 55)
    self.assertEqual(client._parse_feed.call_count, 6)
def atom(all):
    """Redirect an Atom feed request to the corresponding arXiv-hosted URL.

    Args:
        all: The request value; it is logged and converted to a search object
            with ``to_search``.
    """
    message = "Got Atom feed request: {all}".format(all=all)
    logging.info(message, extra={"query": all})
    # Build the URL from a search object so URL generation is not duplicated.
    search_obj = to_search(all)
    arxiv_url = arxiv.Client()._format_url(search_obj, 0, MAX_RESULTS)
    # 301 marks the redirect to the arXiv-hosted URL as permanent.
    # NOTE(review): the value of redirect() is discarded here; this presumably
    # relies on the web framework's redirect() raising -- confirm.
    redirect(arxiv_url, 301)
def test_sleep_multiple_requests(self, patched_time_sleep):
    """Two requests for different URLs are rate limited like repeated ones."""
    client = arxiv.Client(page_size=1)
    quantum_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)
    testing_url = client._format_url(arxiv.Search(query="testing"), 0, 1)

    # The URL does not matter for rate limiting, so the expectations mirror
    # those of `test_sleep_standard`: no sleep on the first request...
    client._parse_feed(quantum_url)
    patched_time_sleep.assert_not_called()

    # ...and a sleep of about delay_seconds on the immediately following one.
    client._last_request_dt = datetime.now()
    client._parse_feed(testing_url)
    expected_delay = approx(client.delay_seconds, rel=1e-3)
    patched_time_sleep.assert_called_once_with(expected_delay)
def test_sleep_standard(self, patched_time_sleep):
    """A second request right after the first sleeps for about delay_seconds."""
    client = arxiv.Client(page_size=1)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)

    # The first request has nothing to wait for.
    client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()

    # Page fetch times vary between environments; resetting _last_request_dt
    # to "now" keeps the expected sleep duration stable and the test less flaky.
    client._last_request_dt = datetime.now()
    client._parse_feed(feed_url)
    expected_delay = approx(client.delay_seconds, rel=1e-3)
    patched_time_sleep.assert_called_once_with(expected_delay)
def test_sleep_elapsed(self, patched_time_sleep):
    """The client sleeps only while less than delay_seconds have elapsed."""
    client = arxiv.Client(page_size=1)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)

    # Last request was one second short of delay_seconds ago: must sleep.
    almost_elapsed = timedelta(seconds=client.delay_seconds - 1)
    client._last_request_dt = datetime.now() - almost_elapsed
    client._parse_feed(feed_url)
    patched_time_sleep.assert_called_once()

    patched_time_sleep.reset_mock()

    # Last request was a full delay_seconds ago: must not sleep.
    fully_elapsed = timedelta(seconds=client.delay_seconds)
    client._last_request_dt = datetime.now() - fully_elapsed
    client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()
def test_retry(self):
    """A client that always receives a 500 raises ``HTTPError``.

    For each configured ``num_retries``, the raised error's ``retry``
    attribute is expected to equal ``num_retries - 1``.
    """
    # broken_client always encounters a 500 status.
    broken_client = arxiv.Client(page_size=1, delay_seconds=0)
    broken_client.query_url_format = "https://httpstat.us/500?{}"

    def broken_get():
        search = arxiv.Search(query="quantum")
        return next(broken_client.get(search))

    self.assertRaises(arxiv.HTTPError, broken_get)

    for num_retries in [2, 5]:
        broken_client.num_retries = num_retries
        # Use the context-manager form so the test fails if no HTTPError is
        # raised; a bare try/except would silently skip the assertion below.
        with self.assertRaises(arxiv.HTTPError) as raised:
            broken_get()
        self.assertEqual(raised.exception.retry, broken_client.num_retries - 1)
def test_retry(self):
    """A client that always receives a 500 raises ``HTTPError``.

    For each configured ``num_retries``, the raised error's ``retry``
    attribute is expected to equal ``num_retries - 1``.
    """
    # The client starts with two retries; the loop below reconfigures it.
    broken_client = arxiv.Client(num_retries=2)
    # Always returns a 500 status.
    broken_client.query_url_format = "https://httpstat.us/500?{}"

    def broken_get():
        search = arxiv.Search(query="quantum")
        return next(broken_client.get(search))

    self.assertRaises(arxiv.HTTPError, broken_get)

    for num_retries in [2, 5]:
        broken_client.num_retries = num_retries
        # Use the context-manager form so the test fails if no HTTPError is
        # raised; a bare try/except would silently skip the assertion below.
        with self.assertRaises(arxiv.HTTPError) as raised:
            broken_get()
        self.assertEqual(raised.exception.retry, broken_client.num_retries - 1)
def get_arxiv_papers(
    query: str,
    fields: List = [
        "title", "authors", "date", "abstract", "journal", "doi"
    ],
    max_results: int = 99999,
    client_options: Dict = {"num_retries": 10},
    search_options: Dict = dict(),
):
    """
    Run an arxiv API query and return the requested fields of every paper.

    Args:
        query (str): Query for the arxiv API, written in the arxiv API notation.
        fields (List[str]): Names of the fields to keep in the output.
        max_results (int): Upper bound on the number of results. Defaults
            to 99999.
        client_options (Dict): Keyword arguments forwarded to `arxiv.Client`,
            e.g. page_size (int), delay_seconds (int), num_retries (int).
            NOTE: A lower 'num_retries' speeds up processing but may lead to
            more frequent 'UnexpectedEmptyPageErrors'.
        search_options (Dict): Keyword arguments forwarded to `arxiv.Search`,
            e.g. id_list (List), sort_by, or sort_order.

    Returns:
        list of dicts. One dict per paper.
    """
    # The default arguments are shared between calls; they are only read here.
    api_client = arxiv.Client(**client_options)
    api_search = arxiv.Search(
        query=query, max_results=max_results, **search_options
    )

    def _identity(value):
        return value

    papers = []
    for paper in api_client.results(api_search):
        record = {}
        for attr_name, attr_value in vars(paper).items():
            # Translate the attribute name to its output field name, if mapped.
            field_name = arxiv_field_mapper.get(attr_name, attr_name)
            if field_name not in fields:
                continue
            # Fields without a dedicated processor are passed through unchanged.
            converter = process_fields.get(field_name, _identity)
            record[field_name] = converter(attr_value)
        papers.append(record)
    return papers
def crawler(query, sort_by, sort_order, page_size, subjectcategory, max_results=float('inf')):
    """Search arXiv per subject keyword and write new results to per-year markdown files.

    For every subject in ``query``, each keyword is searched on arXiv. Results
    that match one of the given categories and are not yet in the subject's
    ID set are grouped by year, merged with any existing ``<year>.md`` file,
    and written back. If anything new was collected, the ID set is saved to
    ``db.txt``.

    Args:
        query: JSON string; decoded with ``json.loads`` and iterated as a
            mapping of subject -> keywords.
        sort_by: Key looked up in ``sort_by_dict``.
        sort_order: Key looked up in ``sort_order_dict``.
        page_size: Converted with ``int()`` and passed to ``arxiv.Client``.
        subjectcategory: JSON string; the decoded value is used for membership
            tests against each result's categories.
        max_results: Passed to ``arxiv.Search``; converted with ``int()`` when
            given as a string.
    """
    # Parameter handling
    query = json.loads(query)
    subjectcategory = json.loads(subjectcategory)
    max_results = int(max_results) if isinstance(max_results, str) else max_results
    # Client configuration: one API request every 5 seconds, retry 5 times on error
    client = arxiv.Client(
        page_size=int(page_size),
        delay_seconds=5,
        num_retries=5
    )
    for subject, key_words in query.items():
        # Results collected for this subject, grouped by year.
        query_results = defaultdict(list)
        db_set, arxiv_db_path = load_set(subject)
        # One search request per keyword
        for key_word in key_words:
            search = arxiv.Search(
                query=key_word,
                max_results=max_results,
                sort_by=sort_by_dict[sort_by],
                sort_order=sort_order_dict[sort_order]
            )
            try:
                for result in client.get(search):
                    # Check whether the result is in one of the specified categories;
                    # the for/else skips the result when no category matched.
                    for cate in result.categories:
                        if cate in subjectcategory:
                            break
                    else:
                        continue
                    # Skip results whose ID is already in the database set
                    short_id = result.get_short_id()
                    if short_id in db_set:
                        continue
                    db_set.add(short_id)
                    # NOTE(review): `.tm_year` and `time.strftime` below presumably
                    # expect `result.updated` to be a time.struct_time -- confirm
                    # against the installed arxiv package version.
                    year = result.updated.tm_year
                    ori = dict()
                    ori['title'] = result.title
                    ori['authors'] = [author.name for author in result.authors]
                    ori['updated_sorted'] = result.updated
                    # ori['published'] = time.strftime('%Y-%m-%d %H:%M:%S', result.published)
                    ori['updated'] = time.strftime('%Y-%m-%d %H:%M:%S', result.updated)
                    ori['summary'] = result.summary.replace('\n', ' ')
                    # ori['comment'] = result.comment
                    # ori['primary_category'] = result.primary_category
                    # ori['categories'] = result.categories
                    ori['pdf_url'] = result.get_pdf_url()
                    ori['short_id'] = result.get_short_id()
                    query_results[year].append(ori)
            # Errors are reported and the loop moves on to the next keyword.
            except arxiv.UnexpectedEmptyPageError:
                print(f"{subject}--{key_word}: arxiv.UnexpectedEmptyPageError")
            except arxiv.HTTPError:
                print(f"{subject}--{key_word}: arxiv.HTTPError")
            except Exception as error:
                print(f"{subject}--{key_word}: {error}")
        # Parse and store the results
        for year, results in query_results.items():
            markdown_fp = os.path.join(arxiv_db_path, f'{year}.md')
            if os.path.exists(markdown_fp):
                # Merge new items into the entries already stored for this year.
                old_results = load_markdown(markdown_fp)
                query_set = set([item['short_id'] for item in old_results])
                for item in results:
                    if item['short_id'] not in query_set:
                        old_results.append(item)
                results = old_results
            results = sorted(results, key=lambda item: item['updated_sorted'])
            markdown = []
            markdown.append(f"# {year}\n")
            toc = []
            content = defaultdict(list)
            for result in results:
                # Drop everything after the last '-' of the 'updated' string
                # (leaving year-month) to get the section key.
                ym = result['updated'].rsplit('-', 1)[0]
                if ym not in toc:
                    toc.append(ym)
                paper = f"<details>\n\n<summary>{result['updated']} - {result['title']}</summary>\n\n" \
                        f"- *{', '.join(result['authors'])}*\n\n" \
                        f"- `{result['short_id']}` - [abs](http://arxiv.org/abs/{result['short_id']}) - [pdf]({result['pdf_url']})\n\n" \
                        f"> {result['summary']}\n\n" \
                        f"</details>\n\n"
                content[ym].append(paper)
            markdown.append("## TOC\n")
            toc = sorted(toc)
            markdown.append("\n".join([f"- [{t}](#{t})" for t in toc])+'\n')
            for ym, papers in content.items():
                markdown.append(f"## {ym}\n")
                markdown.append("".join(papers))
            with open(markdown_fp, "w", encoding='utf-8') as f:
                f.write("\n".join(markdown))
        # Save the ID set only when this subject produced new results.
        if len(query_results) > 0:
            with open(os.path.join(arxiv_db_path, 'db.txt'), "w") as f:
                db_str = json.dumps(list(db_set))
                f.write(db_str)
def test_sleep_zero_delay(self, patched_time_sleep):
    """With delay_seconds=0, back-to-back requests never sleep."""
    client = arxiv.Client(page_size=1, delay_seconds=0)
    feed_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)
    # Two consecutive requests for the same URL.
    for _ in range(2):
        client._parse_feed(feed_url)
    patched_time_sleep.assert_not_called()
def test_max_results(self):
    """A search limited to two results yields exactly two results."""
    client = arxiv.Client(page_size=10, delay_seconds=0)
    limited_search = arxiv.Search(query="testing", max_results=2)
    fetched = list(client.get(limited_search))
    self.assertEqual(len(fetched), 2)
def __init__(self, s):
    """Store the search query and create a default arxiv client.

    Args:
        s: Value stored as ``search_query``.
    """
    self.search_query = s
    default_client = arxiv.Client()
    self._client = default_client