def test_download_uses_pre_signed_url(self):
        """Verify a pre-signed S3 location is fetched through requests.get.

        A real `requests.Session` is handed to `download_file`; only the
        module-level `requests.get` is patched, and it is expected to
        receive the one and only download request.
        """
        signed_url = 'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz'
        fallback_url = 'http://loc2/blah.warc.gz'
        filename = 'blah.warc.gz'
        data_file = wc.DataFile(
            [signed_url, fallback_url],
            filename,
            {'md5': '72b484a2610cb54ec22e48c8104ba3bd'},
            123456,
        )
        response = MockResponse200('')

        with patch('requests.get', return_value=response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                wc.download_file(data_file, requests.Session(), filename)

        # Exactly one request went out via requests.get, aimed at the first
        # (pre-signed) location, and its response was passed to write_file.
        mock_get.assert_called_once_with(signed_url, stream=True)
        mock_write_file.assert_called_once_with(response, filename)
 def test_run_file_already_verified(self):
     """Check that `verify_file` is skipped for an already-verified file."""
     # The patched download_file hands back a DataFile flagged as verified.
     pre_verified = wc.DataFile(self.locations, self.filename, self.checksums, self.size)
     pre_verified.verified = True
     # Queue the same file data twice.
     get_q = multiprocessing.JoinableQueue()
     for _ in range(2):
         get_q.put(self.data_file)
     manager = multiprocessing.Manager()
     result_q = manager.Queue()
     log_q = manager.Queue()
     with patch('wasapi_client.verify_file', return_value=True) as mock_verify:
         with patch('wasapi_client.download_file', return_value=pre_verified):
             downloader = wc.Downloader(get_q, result_q, log_q)
             downloader.start()
             downloader.run()
     # join() returns only after every queued item has been marked done.
     get_q.join()
     assert log_q.empty()
     expected = ('success', self.filename)
     for _ in range(2):
         assert result_q.get() == expected
     assert result_q.empty()
     # The returned DataFile was already verified, so the patched
     # `verify_file` must never have been invoked.
     mock_verify.assert_not_called()
class TestDownloader:
    """Exercise `wc.Downloader` with `wasapi_client.download_file` patched out."""

    # Fixture data describing the single file each test places on the queue.
    locations = ['http://loc1/blah.warc.gz', 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed',
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_run(self):
        """Successful downloads yield one 'success' result per queued file."""
        # Queue the same file data twice.
        get_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        with patch('wasapi_client.verify_file', return_value=True):
            with patch('wasapi_client.download_file', return_value=self.data_file):
                downloader = wc.Downloader(get_q, result_q, log_q)
                downloader.start()
                downloader.run()
        # join() returns only after every queued item has been marked done.
        get_q.join()
        # Nothing should have been logged on the success path.
        assert log_q.empty()
        expected = ('success', self.filename)
        for _ in range(2):
            assert result_q.get() == expected
        # No results beyond those two may remain.
        assert result_q.empty()

    @patch('wasapi_client.download_file')
    def test_run_WASAPIDownloadError(self, mock_download):
        """Failed downloads are logged and reported as 'failure' results."""
        expected_error = 'WD Error'
        mock_download.side_effect = wc.WASAPIDownloadError(expected_error)
        # Queue the same file data twice.
        get_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        downloader = wc.Downloader(get_q, result_q, log_q)
        downloader.start()
        downloader.run()
        # join() returns only after every queued item has been marked done.
        get_q.join()
        expected_result = ('failure', self.filename)
        for _ in range(2):
            # Each failure produces one log record and one result entry.
            assert log_q.get().msg == expected_error
            assert result_q.get() == expected_result
        # No results beyond those two may remain.
        # NOTE(review): an earlier comment warned that `empty` may need a
        # moment to register; no wait is performed before this check.
        assert result_q.empty()

    def test_run_file_already_verified(self):
        """Check that `verify_file` is skipped for an already-verified file."""
        # The patched download_file hands back a DataFile flagged as verified.
        pre_verified = wc.DataFile(self.locations, self.filename, self.checksums, self.size)
        pre_verified.verified = True
        # Queue the same file data twice.
        get_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            get_q.put(self.data_file)
        manager = multiprocessing.Manager()
        result_q = manager.Queue()
        log_q = manager.Queue()
        with patch('wasapi_client.verify_file', return_value=True) as mock_verify:
            with patch('wasapi_client.download_file', return_value=pre_verified):
                downloader = wc.Downloader(get_q, result_q, log_q)
                downloader.start()
                downloader.run()
        # join() returns only after every queued item has been marked done.
        get_q.join()
        assert log_q.empty()
        expected = ('success', self.filename)
        for _ in range(2):
            assert result_q.get() == expected
        assert result_q.empty()
        # The returned DataFile was already verified, so the patched
        # `verify_file` must never have been invoked.
        mock_verify.assert_not_called()
class Test_download_file:
    """Tests for `wc.download_file` with HTTP access and file writing mocked."""

    # Fixture data: one file that is available from two locations.
    locations = ['http://loc1/blah.warc.gz', 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed',
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_download_file_200(self):
        """A good response from the first location is passed to write_file once."""
        session = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(session, 'get', return_value=ok_response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                result = wc.download_file(self.data_file, session, self.filename)

        # Only the first location is requested once a download succeeds.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(ok_response, self.filename)
        assert not result.verified

    def test_download_file_not_200(self):
        """Every location is tried before WASAPIDownloadError is raised."""
        session = requests.Session()
        forbidden = MockResponse403()

        with patch.object(session, 'get', return_value=forbidden) as mock_get:
            with pytest.raises(wc.WASAPIDownloadError) as err:
                wc.download_file(self.data_file, session, self.filename)

        # The error message names both the locations and the filename.
        message = err.value.args[0]
        assert str(self.locations) in message
        assert self.filename in message
        # One streamed request per location, in order.
        expected_calls = [call(url, stream=True) for url in self.locations]
        mock_get.assert_has_calls(expected_calls)

    def test_download_get_raises_some_RequestException(self, caplog):
        """A request exception on the first location falls through to the second."""
        caplog.set_level(INFO)
        session = requests.Session()
        ok_response = MockResponse200('')
        # The first attempt raises a RequestException subclass; the second
        # attempt returns the mocked successful response.
        attempts = [requests.exceptions.ConnectionError(), ok_response]

        with patch.object(session, 'get', side_effect=attempts) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                wc.download_file(self.data_file, session, self.filename)

        # One streamed request per location, in order.
        expected_calls = [call(url, stream=True) for url in self.locations]
        mock_get.assert_has_calls(expected_calls)
        mock_write_file.assert_called_once_with(ok_response, self.filename)
        # The failed attempt and the later success must both appear in the log.
        expected_fragments = (
            'Error downloading http://loc1/blah.warc.gz:',
            'http://loc2/blah.warc.gz: 200 OK',
        )
        for fragment in expected_fragments:
            assert fragment in caplog.text

    def test_download_file_OSError(self):
        """An OSError from write_file surfaces as WASAPIDownloadError."""
        session = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(session, 'get', return_value=ok_response) as mock_get:
            with patch('wasapi_client.write_file', side_effect=OSError) as mock_write_file:
                with pytest.raises(wc.WASAPIDownloadError) as err:
                    wc.download_file(self.data_file, session, self.filename)

        # The error message names both the locations and the filename.
        message = err.value.args[0]
        assert str(self.locations) in message
        assert self.filename in message
        # Only the first location was requested, and one write was attempted.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(ok_response, self.filename)

    def test_download_check_exists_true(self):
        """No request is made when `check_exists` reports the file is present."""
        with patch('wasapi_client.check_exists', return_value=True):
            with patch('requests.Session', autospec=True) as mock_session:
                result = wc.download_file(self.data_file, mock_session, self.filename)
        # The returned DataFile must be flagged as verified.
        assert result.verified
        # The mocked session must not have issued any GET.
        assert not mock_session.get.called

    def test_download_uses_pre_signed_url(self):
        """Verify a pre-signed S3 location is fetched through requests.get.

        A real `requests.Session` is handed to `download_file`; only the
        module-level `requests.get` is patched, and it is expected to
        receive the one and only download request.
        """
        signed_url = 'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz'
        fallback_url = 'http://loc2/blah.warc.gz'
        filename = 'blah.warc.gz'
        data_file = wc.DataFile(
            [signed_url, fallback_url],
            filename,
            {'md5': '72b484a2610cb54ec22e48c8104ba3bd'},
            123456,
        )
        response = MockResponse200('')

        with patch('requests.get', return_value=response) as mock_get:
            with patch('wasapi_client.write_file') as mock_write_file:
                wc.download_file(data_file, requests.Session(), filename)

        # Exactly one request went out via requests.get, aimed at the first
        # (pre-signed) location, and its response was passed to write_file.
        mock_get.assert_called_once_with(signed_url, stream=True)
        mock_write_file.assert_called_once_with(response, filename)
Beispiel #5
0
class Test_download_file:
    """Tests for `wc.download_file` with HTTP access and file writing mocked."""

    # Fixture data: one file that is available from two locations.
    locations = ['http://loc1/blah.warc.gz', 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed'
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_download_file_200(self):
        """A good response from the first location is passed to write_file once."""
        session = requests.Session()
        mock_200 = MockResponse200('')

        with patch.object(session, 'get', return_value=mock_200) as mock_get, \
                patch('wasapi_client.write_file') as mock_write_file:
            file_data = wc.download_file(self.data_file, session,
                                         self.filename)

        # Check we only tried downloading files until successful download.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(mock_200, self.filename)
        assert not file_data.verified

    def test_download_file_not_200(self):
        """Every location is tried before WASAPIDownloadError is raised."""
        session = requests.Session()
        mock_403 = MockResponse403()

        with patch.object(session, 'get', return_value=mock_403) as mock_get, \
                pytest.raises(wc.WASAPIDownloadError) as err:
            wc.download_file(self.data_file, session, self.filename)

        # `err` is pytest's ExceptionInfo wrapper; inspect the raised
        # exception itself (`err.value`) rather than the wrapper's string
        # form, which is not guaranteed to contain the full message.
        message = str(err.value)
        for item in (str(self.locations), self.filename):
            assert item in message
        # Check all locations were tried.
        calls = [
            call(self.locations[0], stream=True),
            call(self.locations[1], stream=True)
        ]
        mock_get.assert_has_calls(calls)

    def test_download_file_OSError(self):
        """An OSError from write_file surfaces as WASAPIDownloadError."""
        session = requests.Session()
        mock_200 = MockResponse200('')

        with patch.object(session, 'get', return_value=mock_200) as mock_get, \
                patch('wasapi_client.write_file') as mock_write_file:
            mock_write_file.side_effect = OSError
            with pytest.raises(wc.WASAPIDownloadError) as err:
                wc.download_file(self.data_file, session, self.filename)

        # `err` is pytest's ExceptionInfo wrapper; inspect the raised
        # exception itself (`err.value`) rather than the wrapper's string
        # form, which is not guaranteed to contain the full message.
        message = str(err.value)
        for item in (str(self.locations), self.filename):
            assert item in message
        # Check we only tried downloading files until successful download.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(mock_200, self.filename)

    def test_download_check_exists_true(self):
        """Test a file already existing on the filesystem is not downloaded."""
        with patch('wasapi_client.check_exists', return_value=True), \
                patch('requests.Session', autospec=True) as mock_session:
            file_data = wc.download_file(self.data_file, mock_session,
                                         self.filename)
        # Check `verified` has been set True on the returned DataFile.
        assert file_data.verified
        # Check that no get request was made.
        assert not mock_session.get.called