Example 1
def test_raises_appropriate_exception_if_other_non_ok_response():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = False
            mock_resp.status_code = 403
            mock_requests.get.return_value = mock_resp

            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
Example 2
    def process(self, item: ArxivId) -> Iterator[None]:
        if self.args.source == "arxiv":
            fetch_from_arxiv(item)
            # This method of delaying fetches assumes that calls to 'process' will be made sequentially
            # and not in parallel. Delay mechanisms will need to be more sophisticated if we transition
            # to parallel data fetching.
            time.sleep(FETCH_DELAY)
            yield None
        elif self.args.source == "s3":
            fetch_from_s3(item, self.args.s3_bucket)
            yield None
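
Example 2 relies on several names defined elsewhere in its module. A minimal set of supporting definitions, with the import paths and the delay value assumed purely for illustration, might look like:

import time
from typing import Iterator

from common.fetch_arxiv import fetch_from_arxiv  # assumed import path
from common.fetch_s3 import fetch_from_s3        # assumed import path

ArxivId = str     # assumed: the tests below pass plain strings like "fakeid"
FETCH_DELAY = 10  # seconds between arXiv requests; the actual value is not shown in these examples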
Example 3
def test_saves_source_if_all_good():
    with patch("common.fetch_arxiv.save_source_archive") as mock_save_source:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "i am some content"
            mock_requests.get.return_value = mock_resp

            fetch_from_arxiv("fakeid")

            mock_save_source.assert_called_with("fakeid", "i am some content", None)
Example 4
def test_raises_fetch_exception_if_content_is_pdf():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "some pdf content"
            mock_resp.headers = {"Content-Type": "application/pdf"}
            mock_requests.get.return_value = mock_resp

            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")
Example 5
def test_raises_regular_error_in_case_of_404():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = False
            mock_resp.status_code = 404
            mock_requests.get.return_value = mock_resp

            try:
                fetch_from_arxiv("fakeid")
            except Exception as e:
                assert not isinstance(e, FetchFromArxivException)
            else:
                assert False, "Expected to receive an exception"
Example 6
def test_raises_regular_exception_if_content_is_not_pdf_nor_tarball():
    with patch("common.fetch_arxiv.save_source_archive") as _:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_resp = Mock()
            mock_resp.ok = True
            mock_resp.status_code = 200
            mock_resp.content = "some text content"
            mock_resp.headers = {"Content-Type": "application/text"}
            mock_requests.get.return_value = mock_resp

            try:
                fetch_from_arxiv("fakeid")
            except Exception as e:
                assert not isinstance(e, FetchFromArxivException)
            else:
                assert False, "Expected to receive an exception"
Example 7
    def process(self, item: ArxivId) -> Iterator[None]:
        if self.args.source == "arxiv":
            attempt = 0

            while True:
                try:
                    result = fetch_from_arxiv(item)
                    yield result
                    break
                except FetchFromArxivException as e:
                    if attempt < MAX_FETCH_ATTEMPTS - 1:
                        logger.warning(
                            "Trouble getting data from ArXiv. Backing off and trying again."
                        )
                        attempt += 1
                        time.sleep(BACKOFF_FETCH_DELAY)
                    else:
                        logger.warning("Exceed maximum retries to ArXiv.")
                        time.sleep(BACKOFF_FETCH_DELAY)
                        raise e

            # This method of delaying fetches assumes that calls to 'process' will be made sequentially
            # and not in parallel. Delay mechanisms will need to be more sophisticated if we transition
            # to parallel data fetching.
            time.sleep(DEFAULT_FETCH_DELAY)
            yield None

        elif self.args.source == "s3":
            fetch_from_s3(item, self.args.s3_bucket)
            yield None
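
The retry loop in Example 7 waits a fixed BACKOFF_FETCH_DELAY between attempts. The constants it relies on are not shown in these examples; plausible definitions, with the values assumed, appear below. A common refinement would be exponential backoff, where the wait doubles after each failed attempt so that persistent failures put progressively less load on arXiv:

MAX_FETCH_ATTEMPTS = 3      # assumed value
BACKOFF_FETCH_DELAY = 60    # seconds; assumed value
DEFAULT_FETCH_DELAY = 10    # seconds; assumed value

def backoff_delay(attempt: int) -> float:
    # Exponential variant: 60s, 120s, 240s, ... for attempts 0, 1, 2, ...
    # Example 7 itself sleeps for the fixed BACKOFF_FETCH_DELAY instead.
    return BACKOFF_FETCH_DELAY * (2 ** attempt)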
Example 8
def test_raises_appropriate_exception_if_request_fails_outright():
    with patch("common.fetch_arxiv.save_source_archive") as mock_save_source:
        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_requests.get.side_effect = urllib3.exceptions.ProtocolError()

            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")

            assert not mock_save_source.called

        with patch("common.fetch_arxiv.requests") as mock_requests:
            mock_requests.get.side_effect = requests.exceptions.HTTPError()

            with pytest.raises(FetchFromArxivException):
                fetch_from_arxiv("fakeid")

            assert not mock_save_source.called
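
Taken together, Examples 1, 3, 4, 5, 6, and 8 pin down the contract of fetch_from_arxiv: retryable failures (rate limiting, network errors, a PDF served in place of a source archive) raise FetchFromArxivException; permanent failures (a 404, an unexpected content type) raise an ordinary exception; and on success the response body is handed to save_source_archive. The sketch below is a minimal implementation that satisfies those tests, not the project's actual code: the e-print URL, the use of ValueError for permanent failures, and the exact content-type checks are all assumptions, and the real implementation is presumably stricter about which content types it accepts.

from typing import Optional

import requests
import urllib3

from common.save import save_source_archive  # assumed import; the tests patch common.fetch_arxiv.save_source_archive


class FetchFromArxivException(Exception):
    """Raised for failures that are worth backing off and retrying."""


def fetch_from_arxiv(arxiv_id: str, dest: Optional[str] = None) -> None:
    uri = f"https://arxiv.org/e-print/{arxiv_id}"  # assumed endpoint
    try:
        response = requests.get(uri)
    except (urllib3.exceptions.ProtocolError, requests.exceptions.HTTPError) as e:
        # Outright request failures are retryable (Example 8).
        raise FetchFromArxivException(str(e))

    if response.status_code == 404:
        # A missing paper is permanent: raise an ordinary error, not a retryable one (Example 5).
        raise ValueError(f"Paper {arxiv_id} not found on arXiv")
    if not response.ok:
        # Any other non-OK status, e.g. 403 rate limiting, is retryable (Example 1).
        raise FetchFromArxivException(f"arXiv returned status {response.status_code}")

    content_type = response.headers.get("Content-Type")
    if content_type == "application/pdf":
        # arXiv serves a PDF when no source archive is available (Example 4).
        raise FetchFromArxivException("Fetched a PDF instead of a source archive")
    if content_type == "application/text":
        # Content that is neither a PDF nor a tarball is a permanent failure (Example 6).
        raise ValueError(f"Unexpected content type: {content_type}")

    save_source_archive(arxiv_id, response.content, dest)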
Example 9
        default="tmp",
    )

    args = parser.parse_args()
    arxiv_id = args.arxiv_id

    output_dir = args.output_dir
    archives_dir = os.path.join(output_dir, "archives")
    archive_path = os.path.join(archives_dir,
                                directories.escape_slashes(arxiv_id))
    sources_dir = os.path.join(output_dir,
                               directories.escape_slashes(arxiv_id))

    if not os.path.exists(archives_dir):
        print(f"Creating directory to hold source archives at {archives_dir}.")
        os.makedirs(archives_dir)

    print(
        f"Downloading archive of source files from arXiv for paper {arxiv_id}...",
        end="",
    )
    fetch_from_arxiv(arxiv_id, dest=archive_path)
    print("done.")

    if not os.path.exists(sources_dir):
        print(f"Creating directory to hold unpacked sources at {sources_dir}.")
        os.makedirs(sources_dir)

    print(f"Unpacking sources for paper {arxiv_id} into {sources_dir}.")
    unpack_archive(archive_path, sources_dir)
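
Example 9 opens mid-way through the script's argument parser setup. A plausible reconstruction of the elided lines is shown below; the description text, the positional/flag argument names, and the import paths are all assumptions made for illustration (only args.arxiv_id, args.output_dir, and the "tmp" default are grounded in the snippet):

import argparse
import os

from common import directories                  # assumed import paths
from common.fetch_arxiv import fetch_from_arxiv
from common.unpack import unpack_archive

parser = argparse.ArgumentParser(description="Fetch and unpack sources for an arXiv paper.")
parser.add_argument("arxiv_id", help="arXiv ID of the paper to fetch")
parser.add_argument(
    "--output-dir",
    help="directory in which to place the downloaded archive and unpacked sources",
    default="tmp",
)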