Esempio n. 1
0
 def test_from_feed_entry(self):
     """A Result built from the first entry of a parsed feed passes validation."""
     query_url = "http://export.arxiv.org/api/query?search_query=testing"
     parsed_feed = arxiv.Client()._parse_feed(query_url)
     first_entry = parsed_feed.entries[0]
     self.assert_valid_result(arxiv.Result._from_feed_entry(first_entry))
Esempio n. 2
0
 def test_query_page_count(self):
     """55 results at a page size of 10 are expected to take six feed parses."""
     client = arxiv.Client(page_size=10, delay_seconds=0)
     # Wrap (not replace) the parser so real pages are still fetched while
     # the number of calls is recorded.
     client._parse_feed = MagicMock(wraps=client._parse_feed)
     search = arxiv.Search(query="testing", max_results=55)
     fetched = list(client.get(search))
     self.assertEqual(len(fetched), 55)
     self.assertEqual(client._parse_feed.call_count, 6)
Esempio n. 3
0
def atom(all):
    """Redirect an Atom feed request to the corresponding arXiv-hosted URL.

    Args:
        all: The request value; it is logged and handed to `to_search`.
            (The name shadows the builtin but is kept for callers.)
    """
    logging.info(f"Got Atom feed request: {all}", extra={"query": all})
    # Build a search object first so URL generation is not duplicated here.
    feed_search = to_search(all)
    target_url = arxiv.Client()._format_url(feed_search, 0, MAX_RESULTS)
    # 301: permanent redirect to the arXiv-hosted URL.
    redirect(target_url, 301)
Esempio n. 4
0
 def test_sleep_multiple_requests(self, patched_time_sleep):
     """Requests to different URLs share the same rate limit."""
     client = arxiv.Client(page_size=1)
     first_url, second_url = (
         client._format_url(arxiv.Search(query=term), 0, 1)
         for term in ("quantum", "testing")
     )
     # Rate limiting is URL-independent; expect same behavior as in
     # `test_sleep_standard`.
     client._parse_feed(first_url)
     patched_time_sleep.assert_not_called()
     # Pin the last-request timestamp to "now" before the second request.
     client._last_request_dt = datetime.now()
     client._parse_feed(second_url)
     expected_sleep = approx(client.delay_seconds, rel=1e-3)
     patched_time_sleep.assert_called_once_with(expected_sleep)
Esempio n. 5
0
 def test_sleep_standard(self, patched_time_sleep):
     """A second request right after the first sleeps for about delay_seconds."""
     client = arxiv.Client(page_size=1)
     search = arxiv.Search(query="quantum")
     page_url = client._format_url(search, 0, 1)
     # The very first request must go out without sleeping.
     client._parse_feed(page_url)
     patched_time_sleep.assert_not_called()
     # Overwrite _last_request_dt to minimize flakiness: different
     # environments will have different page fetch times.
     client._last_request_dt = datetime.now()
     client._parse_feed(page_url)
     expected_sleep = approx(client.delay_seconds, rel=1e-3)
     patched_time_sleep.assert_called_once_with(expected_sleep)
Esempio n. 6
0
 def test_sleep_elapsed(self, patched_time_sleep):
     """Sleep only while fewer than delay_seconds have passed since the last request."""
     client = arxiv.Client(page_size=1)
     page_url = client._format_url(arxiv.Search(query="quantum"), 0, 1)

     # Last request one second short of the full delay: a sleep is expected.
     almost_elapsed = timedelta(seconds=client.delay_seconds - 1)
     client._last_request_dt = datetime.now() - almost_elapsed
     client._parse_feed(page_url)
     patched_time_sleep.assert_called_once()

     patched_time_sleep.reset_mock()

     # Last request a full delay ago: no sleep is expected.
     fully_elapsed = timedelta(seconds=client.delay_seconds)
     client._last_request_dt = datetime.now() - fully_elapsed
     client._parse_feed(page_url)
     patched_time_sleep.assert_not_called()
Esempio n. 7
0
    def test_retry(self):
        """A client hitting a failing endpoint raises HTTPError with retry == num_retries - 1."""
        # broken_client always encounters a 500 status.
        broken_client = arxiv.Client(page_size=1, delay_seconds=0)
        broken_client.query_url_format = "https://httpstat.us/500?{}"

        def broken_get():
            """Request the first result through the broken client."""
            search = arxiv.Search(query="quantum")
            return next(broken_client.get(search))

        self.assertRaises(arxiv.HTTPError, broken_get)
        for num_retries in [2, 5]:
            broken_client.num_retries = num_retries
            # Use assertRaises instead of try/except: a plain try/except
            # would let the test pass silently if no HTTPError were raised,
            # skipping the retry-count check entirely.
            with self.assertRaises(arxiv.HTTPError) as caught:
                broken_get()
            self.assertEqual(caught.exception.retry,
                             broken_client.num_retries - 1)
Esempio n. 8
0
    def test_retry(self):
        """A client hitting a failing endpoint raises HTTPError with retry == num_retries - 1."""
        initial_retries = 2
        broken_client = arxiv.Client(num_retries=initial_retries)
        # Always returns a 500 status.
        broken_client.query_url_format = "https://httpstat.us/500?{}"

        def broken_get():
            """Request the first result through the broken client."""
            search = arxiv.Search(query="quantum")
            return next(broken_client.get(search))

        self.assertRaises(arxiv.HTTPError, broken_get)
        for num_retries in [2, 5]:
            broken_client.num_retries = num_retries
            # Use assertRaises instead of try/except: a plain try/except
            # would let the test pass silently if no HTTPError were raised,
            # skipping the retry-count check entirely.
            with self.assertRaises(arxiv.HTTPError) as caught:
                broken_get()
            self.assertEqual(caught.exception.retry,
                             broken_client.num_retries - 1)
def get_arxiv_papers(
        query: str,
        fields: List = [
            "title", "authors", "date", "abstract", "journal", "doi"
        ],
        max_results: int = 99999,
        client_options: Dict = {"num_retries": 10},
        search_options: Dict = dict(),
):
    """
    Run a query against the arxiv API and return the selected fields of each
    paper.

    Args:
        query (str): Query to arxiv API. Needs to match the arxiv API notation.
        fields (List[str]): Names of the (mapped) fields to keep in the output.
        max_results (int): Maximal number of results, defaults to 99999.
        client_options (Dict): Keyword arguments forwarded to `arxiv.Client`,
            e.g. page_size (int), delay_seconds (int), num_retries (int).
            NOTE: Decreasing 'num_retries' will speed up processing but might
            result in more frequent 'UnexpectedEmptyPageErrors'.
        search_options (Dict): Keyword arguments forwarded to `arxiv.Search`,
            e.g. id_list (List), sort_by, or sort_order.

    Returns:
        list of dicts. One dict per paper, keyed by the mapped field name.

    """
    # NOTE: the list/dict defaults are shared between calls; they are only
    # read here and must never be mutated.
    api_client = arxiv.Client(**client_options)
    search = arxiv.Search(query=query,
                          max_results=max_results,
                          **search_options)

    papers = []
    for paper in api_client.results(search):
        record = {}
        for raw_name, raw_value in vars(paper).items():
            # Translate the attribute name to its output name (falls back to
            # the attribute name itself when no mapping exists).
            name = arxiv_field_mapper.get(raw_name, raw_name)
            if name not in fields:
                continue
            # Fields without a dedicated processor are passed through as-is.
            convert = process_fields.get(name, lambda x: x)
            record[name] = convert(raw_value)
        papers.append(record)
    return papers
Esempio n. 10
0
def crawler(query,
            sort_by,
            sort_order,
            page_size,
            subjectcategory,
            max_results=float('inf')):
    """Query arXiv per subject/keyword and write the hits to per-year markdown files.

    For every subject in `query`, each keyword is searched through a shared
    `arxiv.Client`. Results are filtered by category, de-duplicated against
    the ID set returned by `load_set`, grouped by the year of
    `result.updated`, and merged into `<year>.md` files under the subject's
    database path. When a subject produced at least one new result, its ID
    set is rewritten to `db.txt` in that same directory.

    Args:
        query: JSON string decoding to a mapping of subject -> iterable of
            keyword query strings.
        sort_by: Key looked up in `sort_by_dict`.
        sort_order: Key looked up in `sort_order_dict`.
        page_size: Page size for the client; converted with `int()`.
        subjectcategory: JSON string decoding to the container of accepted
            categories.
        max_results: Passed to `arxiv.Search`; a string value is converted
            with `int()` first.

    Returns:
        None. Output is written to disk; search errors are printed.
    """
    # Parameter handling
    query = json.loads(query)
    subjectcategory = json.loads(subjectcategory)
    max_results = int(max_results) if isinstance(max_results, str) else max_results

    # Client configuration: one API request every 5 seconds, retry 5 times on error
    client = arxiv.Client(
        page_size=int(page_size),
        delay_seconds=5,
        num_retries=5
    )

    for subject, key_words in query.items():
        # New results for this subject, grouped by year.
        query_results = defaultdict(list)
        db_set, arxiv_db_path = load_set(subject)

        # One search request per keyword
        for key_word in key_words:
            search = arxiv.Search(
                query=key_word,
                max_results=max_results,
                sort_by=sort_by_dict[sort_by],
                sort_order=sort_order_dict[sort_order]
            )

            try:
                for result in client.get(search):
                    # Keep the result only if one of its categories is requested
                    for cate in result.categories:
                        if cate in subjectcategory:
                            break
                    else:
                        continue

                    # Skip results whose short ID is already in the database set
                    short_id = result.get_short_id()
                    if short_id in db_set:
                        continue
                    db_set.add(short_id)

                    # `updated` is used as a time.struct_time here
                    # (tm_year attribute, time.strftime argument).
                    year = result.updated.tm_year
                    ori = dict()
                    ori['title'] = result.title
                    ori['authors'] = [author.name for author in result.authors]
                    ori['updated_sorted'] = result.updated
                    # ori['published'] = time.strftime('%Y-%m-%d %H:%M:%S', result.published)
                    ori['updated'] = time.strftime('%Y-%m-%d %H:%M:%S', result.updated)
                    ori['summary'] = result.summary.replace('\n', ' ')
                    # ori['comment'] = result.comment
                    # ori['primary_category'] = result.primary_category
                    # ori['categories'] = result.categories
                    ori['pdf_url'] = result.get_pdf_url()
                    ori['short_id'] = result.get_short_id()
                    query_results[year].append(ori)
            # Any failure is reported and the loop moves on to the next keyword.
            except arxiv.UnexpectedEmptyPageError:
                print(f"{subject}--{key_word}: arxiv.UnexpectedEmptyPageError")
            except arxiv.HTTPError:
                print(f"{subject}--{key_word}: arxiv.HTTPError")
            except Exception as error:
                print(f"{subject}--{key_word}: {error}")

        # Format and store the results
        for year, results in query_results.items():
            markdown_fp = os.path.join(arxiv_db_path, f'{year}.md')
            if os.path.exists(markdown_fp):
                # Merge with the entries already stored in this year's file,
                # adding only new short IDs.
                old_results = load_markdown(markdown_fp)
                query_set = set([item['short_id'] for item in old_results])
                for item in results:
                    if item['short_id'] not in query_set:
                        old_results.append(item)
                results = old_results
            # NOTE(review): assumes entries returned by load_markdown also
            # carry 'updated_sorted' -- confirm against load_markdown.
            results = sorted(results, key=lambda item: item['updated_sorted'])

            markdown = []
            markdown.append(f"# {year}\n")

            toc = []
            content = defaultdict(list)
            for result in results:
                # Strip everything after the last '-' of the 'updated' string;
                # for the '%Y-%m-%d %H:%M:%S' format used above this leaves
                # the year-month part.
                ym = result['updated'].rsplit('-', 1)[0]
                if ym not in toc:
                    toc.append(ym)
                paper = f"<details>\n\n<summary>{result['updated']} - {result['title']}</summary>\n\n" \
                        f"- *{', '.join(result['authors'])}*\n\n" \
                        f"- `{result['short_id']}` - [abs](http://arxiv.org/abs/{result['short_id']}) - [pdf]({result['pdf_url']})\n\n" \
                        f"> {result['summary']}\n\n" \
                        f"</details>\n\n"
                content[ym].append(paper)

            markdown.append("## TOC\n")
            toc = sorted(toc)
            markdown.append("\n".join([f"- [{t}](#{t})" for t in toc])+'\n')

            # One section per year-month, in the insertion order of `content`.
            for ym, papers in content.items():
                markdown.append(f"## {ym}\n")
                markdown.append("".join(papers))

            # The year's file is fully rewritten with the merged content.
            with open(markdown_fp, "w", encoding='utf-8') as f:
                f.write("\n".join(markdown))

        # Persist the updated ID set only when this subject had new results.
        if len(query_results) > 0:
            with open(os.path.join(arxiv_db_path, 'db.txt'), "w") as f:
                db_str = json.dumps(list(db_set))
                f.write(db_str)
Esempio n. 11
0
 def test_sleep_zero_delay(self, patched_time_sleep):
     """With delay_seconds=0, back-to-back requests never sleep."""
     client = arxiv.Client(page_size=1, delay_seconds=0)
     search = arxiv.Search(query="quantum")
     page_url = client._format_url(search, 0, 1)
     # Two consecutive requests against the same URL.
     for _ in range(2):
         client._parse_feed(page_url)
     patched_time_sleep.assert_not_called()
Esempio n. 12
0
 def test_max_results(self):
     """max_results caps the result count even below the page size."""
     client = arxiv.Client(page_size=10, delay_seconds=0)
     capped_search = arxiv.Search(query="testing", max_results=2)
     fetched = list(client.get(capped_search))
     self.assertEqual(len(fetched), 2)
Esempio n. 13
0
 def __init__(self, s):
     """Store the search query `s` and create a default arxiv client."""
     self._client = arxiv.Client()
     self.search_query = s