from unittest.mock import Mock, patch

import pytest
import requests
import urllib3

from common.fetch_arxiv import FetchFromArxivException, fetch_from_arxiv


def test_raises_appropriate_exception_if_other_non_ok_response():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = False
            mock_resp.status_code = 403
            mock_requests.get.return_value = mock_resp
            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
def process(self, item: ArxivId) -> Iterator[None]:
    if self.args.source == "arxiv":
        fetch_from_arxiv(item)
        # This method of delaying fetches assumes that calls to 'process' will be made sequentially
        # and not in parallel. Delay mechanisms will need to be more sophisticated if we transition
        # to parallel data fetching.
        time.sleep(FETCH_DELAY)
        yield None
    elif self.args.source == "s3":
        fetch_from_s3(item, self.args.s3_bucket)
        yield None
def test_saves_source_if_all_good():
    with patch("common.fetch_arxiv.save_source_archive") as mock_save_source:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "i am some content"
            mock_requests.get.return_value = mock_resp
            fetch_from_arxiv("fakeid")
            mock_save_source.assert_called_with("fakeid", "i am some content", None)
def test_raises_fetch_exception_if_content_is_pdf():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "some pdf content"
            mock_resp.headers = {"Content-Type": "application/pdf"}
            mock_requests.get.return_value = mock_resp
            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
def test_raises_regular_error_in_case_of_404():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = False
            mock_resp.status_code = 404
            mock_requests.get.return_value = mock_resp
            try:
                fetch_from_arxiv("fakeid")
            except Exception as e:
                assert not isinstance(e, FetchFromArxivException)
            else:
                assert False, "Expected to receive an exception"
def test_raises_regular_exception_if_content_is_not_pdf_nor_tarball():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "some text content"
            mock_resp.headers = {"Content-Type": "application/text"}
            mock_requests.get.return_value = mock_resp
            try:
                fetch_from_arxiv("fakeid")
            except Exception as e:
                assert not isinstance(e, FetchFromArxivException)
            else:
                assert False, "Expected to receive an exception"
def process(self, item: ArxivId) -> Iterator[None]:
    if self.args.source == "arxiv":
        attempt = 0
        while True:
            try:
                result = fetch_from_arxiv(item)
                yield result
                break
            except FetchFromArxivException as e:
                if attempt < MAX_FETCH_ATTEMPTS - 1:
                    logger.warning(
                        "Trouble getting data from ArXiv. Backing off and trying again."
                    )
                    attempt += 1
                    time.sleep(BACKOFF_FETCH_DELAY)
                else:
                    logger.warning("Exceeded maximum retries to ArXiv.")
                    time.sleep(BACKOFF_FETCH_DELAY)
                    raise e
        # This method of delaying fetches assumes that calls to 'process' will be made sequentially
        # and not in parallel. Delay mechanisms will need to be more sophisticated if we transition
        # to parallel data fetching.
        time.sleep(DEFAULT_FETCH_DELAY)
        yield None
    elif self.args.source == "s3":
        fetch_from_s3(item, self.args.s3_bucket)
        yield None
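# The retry loop above leans on three module-level constants that are defined
# elsewhere in the pipeline. The values below are illustrative assumptions,
# not the project's actual settings.
MAX_FETCH_ATTEMPTS = 3  # total tries per paper before the exception propagates
DEFAULT_FETCH_DELAY = 10  # seconds to pause between papers (sequential fetching)
BACKOFF_FETCH_DELAY = 60  # seconds to back off after a failed fetch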
def test_raises_appropriate_exception_if_request_fails_outright():
    with patch("common.fetch_arxiv.save_source_archive") as mock_save_source:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_requests.get.side_effect = urllib3.exceptions.ProtocolError()
            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
            assert not mock_save_source.called

        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_requests.get.side_effect = requests.exceptions.HTTPError()
            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
            assert not mock_save_source.called
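# Taken together, the tests above pin down a contract for fetch_from_arxiv:
# transient failures (connection errors, throttling responses such as the 403
# case) surface as FetchFromArxivException so callers can back off and retry,
# while permanent failures (a 404, an unusable content type) surface as
# ordinary exceptions. The sketch below is one plausible implementation of
# that contract, not the actual code in common/fetch_arxiv.py; the URL, the
# error messages, and the content-type handling are assumptions.
import requests
import urllib3


class FetchFromArxivException(Exception):
    """Raised for failures where backing off and retrying may help."""


def save_source_archive(arxiv_id, content, dest=None):
    # Stub for the real helper in common.fetch_arxiv; per the tests it takes
    # the paper ID, the downloaded content, and an optional destination.
    pass


def fetch_from_arxiv(arxiv_id, dest=None):
    try:
        response = requests.get(f"https://arxiv.org/e-print/{arxiv_id}")
    except (
        requests.exceptions.RequestException,
        urllib3.exceptions.HTTPError,
    ) as error:
        # Protocol and HTTP-level errors from the request machinery are
        # transient, so wrap them in the retryable exception type.
        raise FetchFromArxivException(str(error)) from error

    if not response.ok:
        if response.status_code == 404:
            # The paper simply has no source archive; retrying will not help.
            raise ValueError(f"No sources found on arXiv for paper {arxiv_id}")
        # Any other non-OK status (e.g., 403 from rate limiting) is retryable.
        raise FetchFromArxivException(
            f"arXiv returned unexpected status code {response.status_code}"
        )

    content_type = response.headers.get("Content-Type")
    if content_type == "application/pdf":
        # arXiv serves a compiled PDF when TeX sources are unavailable;
        # treated as retryable in case sources appear later.
        raise FetchFromArxivException(f"Paper {arxiv_id} has no TeX sources")
    if content_type == "application/text":
        # Neither a PDF nor a source tarball: a permanent error.
        raise ValueError(f"Unexpected content type '{content_type}'")

    save_source_archive(arxiv_id, response.content, dest)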
default="tmp", ) args = parser.parse_args() arxiv_id = args.arxiv_id output_dir = args.output_dir archives_dir = os.path.join(output_dir, "archives") archive_path = os.path.join(archives_dir, directories.escape_slashes(arxiv_id)) sources_dir = os.path.join(output_dir, directories.escape_slashes(arxiv_id)) if not os.path.exists(archives_dir): print(f"Creating directory to hold source archives at {archives_dir}.") os.makedirs(archives_dir) print( f"Downloading archive of source files from arXiv for paper {arxiv_id}...", end="", ) fetch_from_arxiv(arxiv_id, dest=archive_path) print("done.") if not os.path.exists(sources_dir): print(f"Creating directory to hold unpacked sources at {sources_dir}.") os.makedirs(sources_dir) print(f"Unpacking sources for paper {arxiv_id} into {sources_dir}.") unpack_archive(archive_path, sources_dir)