def test_download_uses_pre_signed_url(self):
    """Check that an S3 location is fetched through requests.get.

    Only the module-level ``requests.get`` is mocked; the session passed
    to ``download_file`` is a plain ``requests.Session``.
    """
    s3_locations = [
        'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz',
        'http://loc2/blah.warc.gz',
    ]
    warc_name = 'blah.warc.gz'
    md5_only = {'md5': '72b484a2610cb54ec22e48c8104ba3bd'}
    s3_data_file = wc.DataFile(s3_locations, warc_name, md5_only, 123456)
    ok_response = MockResponse200('')

    with patch('requests.get', return_value=ok_response) as get_mock:
        with patch('wasapi_client.write_file') as write_mock:
            wc.download_file(s3_data_file, requests.Session(), warc_name)

    # Exactly one fetch (the first location) and exactly one write.
    get_mock.assert_called_once_with(s3_locations[0], stream=True)
    write_mock.assert_called_once_with(ok_response, warc_name)
def test_run_file_already_verified(self):
    """Check the downloader skips verify_file for an already verified file."""
    verified_file = wc.DataFile(
        self.locations,
        self.filename,
        self.checksums,
        self.size,
    )
    verified_file.verified = True

    # Put two identical work items on the input queue.
    work_q = multiprocessing.JoinableQueue()
    for _ in range(2):
        work_q.put(self.data_file)

    # Keep a reference to the manager for as long as its queues are used.
    manager = multiprocessing.Manager()
    outcomes = manager.Queue()
    log_records = manager.Queue()

    with patch('wasapi_client.verify_file', return_value=True) as verify_mock:
        with patch('wasapi_client.download_file', return_value=verified_file):
            downloader = wc.Downloader(work_q, outcomes, log_records)
            downloader.start()
            downloader.run()

    # join() only returns once every queued item has been marked done.
    work_q.join()

    assert log_records.empty()
    expected = ('success', self.filename)
    for _ in range(2):
        assert outcomes.get() == expected
    assert outcomes.empty()

    # download_file returned a file flagged as verified, so verify_file
    # must not have been invoked.
    assert not verify_mock.called
class TestDownloader:
    """Tests for ``wc.Downloader`` driven through multiprocessing queues."""

    # Shared fixture values used to build the DataFile put on the queue.
    locations = ['http://loc1/blah.warc.gz',
                 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed',
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_run(self):
        """Check both queued files are reported as successes."""
        # Put two identical work items on the input queue.
        work_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            work_q.put(self.data_file)

        # Keep a reference to the manager for as long as its queues are used.
        manager = multiprocessing.Manager()
        outcomes = manager.Queue()
        log_records = manager.Queue()

        with patch('wasapi_client.verify_file', return_value=True):
            with patch('wasapi_client.download_file', return_value=self.data_file):
                downloader = wc.Downloader(work_q, outcomes, log_records)
                downloader.start()
                downloader.run()

        # join() only returns once every queued item has been marked done.
        work_q.join()

        # Nothing should have been logged.
        assert log_records.empty()
        expected = ('success', self.filename)
        for _ in range(2):
            assert outcomes.get() == expected
        # No results beyond the two checked above.
        assert outcomes.empty()

    @patch('wasapi_client.download_file')
    def test_run_WASAPIDownloadError(self, mock_download):
        """Check failed downloads are logged and reported as failures."""
        expected_error = 'WD Error'
        mock_download.side_effect = wc.WASAPIDownloadError(expected_error)

        # Put two identical work items on the input queue.
        work_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            work_q.put(self.data_file)

        # Keep a reference to the manager for as long as its queues are used.
        manager = multiprocessing.Manager()
        outcomes = manager.Queue()
        log_records = manager.Queue()

        downloader = wc.Downloader(work_q, outcomes, log_records)
        downloader.start()
        downloader.run()

        # join() only returns once every queued item has been marked done.
        work_q.join()

        expected = ('failure', self.filename)
        for _ in range(2):
            assert log_records.get().msg == expected_error
            assert outcomes.get() == expected
        # No results beyond the two checked above.
        # Sometimes `empty` needs a moment to register.
        assert outcomes.empty()

    def test_run_file_already_verified(self):
        """Check the downloader skips verify_file for an already verified file."""
        verified_file = wc.DataFile(
            self.locations,
            self.filename,
            self.checksums,
            self.size,
        )
        verified_file.verified = True

        # Put two identical work items on the input queue.
        work_q = multiprocessing.JoinableQueue()
        for _ in range(2):
            work_q.put(self.data_file)

        # Keep a reference to the manager for as long as its queues are used.
        manager = multiprocessing.Manager()
        outcomes = manager.Queue()
        log_records = manager.Queue()

        with patch('wasapi_client.verify_file', return_value=True) as verify_mock:
            with patch('wasapi_client.download_file', return_value=verified_file):
                downloader = wc.Downloader(work_q, outcomes, log_records)
                downloader.start()
                downloader.run()

        # join() only returns once every queued item has been marked done.
        work_q.join()

        assert log_records.empty()
        expected = ('success', self.filename)
        for _ in range(2):
            assert outcomes.get() == expected
        assert outcomes.empty()

        # download_file returned a file flagged as verified, so verify_file
        # must not have been invoked.
        assert not verify_mock.called
class Test_download_file:
    """Tests for ``wc.download_file`` with the HTTP and file layers mocked."""

    # Shared fixture values used to build the DataFile under test.
    locations = ['http://loc1/blah.warc.gz',
                 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed',
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_download_file_200(self):
        """Check a 200 response on the first location ends the download."""
        http = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(http, 'get', return_value=ok_response) as get_mock:
            with patch('wasapi_client.write_file') as write_mock:
                result = wc.download_file(self.data_file, http, self.filename)

        # Only the first location is requested once it succeeds.
        get_mock.assert_called_once_with(self.locations[0], stream=True)
        write_mock.assert_called_once_with(ok_response, self.filename)
        assert not result.verified

    def test_download_file_not_200(self):
        """Check non-200 responses from every location raise an error."""
        http = requests.Session()
        forbidden = MockResponse403()

        with patch.object(http, 'get', return_value=forbidden) as get_mock:
            with pytest.raises(wc.WASAPIDownloadError) as err:
                wc.download_file(self.data_file, http, self.filename)

        message = err.value.args[0]
        assert str(self.locations) in message
        assert self.filename in message
        # Every location must have been attempted.
        get_mock.assert_has_calls(
            [call(url, stream=True) for url in self.locations]
        )

    def test_download_get_raises_some_RequestException(self, caplog):
        """Check a requests exception on one location falls through to the next."""
        caplog.set_level(INFO)
        http = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(http, 'get') as get_mock:
            with patch('wasapi_client.write_file') as write_mock:
                # First attempt raises a RequestException subclass; the
                # second attempt yields a successful response.
                get_mock.side_effect = [
                    requests.exceptions.ConnectionError(),
                    ok_response,
                ]
                wc.download_file(self.data_file, http, self.filename)

        # Every location must have been attempted.
        get_mock.assert_has_calls(
            [call(url, stream=True) for url in self.locations]
        )
        write_mock.assert_called_once_with(ok_response, self.filename)
        # The failure and the later success both show up in the log.
        assert 'Error downloading http://loc1/blah.warc.gz:' in caplog.text
        assert 'http://loc2/blah.warc.gz: 200 OK' in caplog.text

    def test_download_file_OSError(self):
        """Check an OSError from write_file surfaces as WASAPIDownloadError."""
        http = requests.Session()
        ok_response = MockResponse200('')

        with patch.object(http, 'get', return_value=ok_response) as get_mock:
            with patch('wasapi_client.write_file') as write_mock:
                write_mock.side_effect = OSError
                with pytest.raises(wc.WASAPIDownloadError) as err:
                    wc.download_file(self.data_file, http, self.filename)

        message = err.value.args[0]
        assert str(self.locations) in message
        assert self.filename in message
        # One request and one write attempt, both for the first location.
        get_mock.assert_called_once_with(self.locations[0], stream=True)
        write_mock.assert_called_once_with(ok_response, self.filename)

    def test_download_check_exists_true(self):
        """Check no request is made when check_exists reports True."""
        with patch('wasapi_client.check_exists', return_value=True):
            with patch('requests.Session', autospec=True) as mock_session:
                result = wc.download_file(self.data_file, mock_session, self.filename)

        # The returned DataFile is flagged as verified.
        assert result.verified
        # No GET was issued through the session.
        assert not mock_session.get.called

    def test_download_uses_pre_signed_url(self):
        """Check that an S3 location is fetched through requests.get.

        Only the module-level ``requests.get`` is mocked; the session passed
        to ``download_file`` is a plain ``requests.Session``.
        """
        s3_locations = [
            'https://data.s3.amazonaws.com/warcs/blah.warc.gz?Signature=xyz',
            'http://loc2/blah.warc.gz',
        ]
        warc_name = 'blah.warc.gz'
        md5_only = {'md5': '72b484a2610cb54ec22e48c8104ba3bd'}
        s3_data_file = wc.DataFile(s3_locations, warc_name, md5_only, 123456)
        ok_response = MockResponse200('')

        with patch('requests.get', return_value=ok_response) as get_mock:
            with patch('wasapi_client.write_file') as write_mock:
                wc.download_file(s3_data_file, requests.Session(), warc_name)

        # Exactly one fetch (the first location) and exactly one write.
        get_mock.assert_called_once_with(s3_locations[0], stream=True)
        write_mock.assert_called_once_with(ok_response, warc_name)
class Test_download_file:
    """Tests for ``wc.download_file`` with the HTTP and file layers mocked."""

    # Shared fixture values used to build the DataFile under test.
    locations = ['http://loc1/blah.warc.gz',
                 'http://loc2/blah.warc.gz']
    filename = 'blah.warc.gz'
    checksums = {
        'sha1': '33304d104f95d826da40079bad2400dc4d005403',
        'md5': '62f87a969af0dd857ecd6c3e7fde6aed'
    }
    size = 12345678
    data_file = wc.DataFile(locations, filename, checksums, size)

    def test_download_file_200(self):
        """Check a 200 response on the first location ends the download."""
        session = requests.Session()
        mock_200 = MockResponse200('')

        with patch.object(session, 'get', return_value=mock_200) as mock_get, \
                patch('wasapi_client.write_file') as mock_write_file:
            file_data = wc.download_file(self.data_file, session, self.filename)

        # Check we only tried downloading files until successful download.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(mock_200, self.filename)
        assert not file_data.verified

    def test_download_file_not_200(self):
        """Check non-200 responses from every location raise an error."""
        session = requests.Session()
        mock_403 = MockResponse403()

        with patch.object(session, 'get', return_value=mock_403) as mock_get, \
                pytest.raises(wc.WASAPIDownloadError) as err:
            wc.download_file(self.data_file, session, self.filename)

        # Inspect the raised exception's own message. `str(err)` would
        # stringify pytest's ExceptionInfo wrapper, which is not the message.
        message = err.value.args[0]
        for item in (str(self.locations), self.filename):
            assert item in message
        # Check all locations were tried.
        calls = [
            call(self.locations[0], stream=True),
            call(self.locations[1], stream=True)
        ]
        mock_get.assert_has_calls(calls)

    def test_download_file_OSError(self):
        """Check an OSError from write_file surfaces as WASAPIDownloadError."""
        session = requests.Session()
        mock_200 = MockResponse200('')

        with patch.object(session, 'get', return_value=mock_200) as mock_get, \
                patch('wasapi_client.write_file') as mock_write_file:
            mock_write_file.side_effect = OSError
            with pytest.raises(wc.WASAPIDownloadError) as err:
                wc.download_file(self.data_file, session, self.filename)

        # Inspect the raised exception's own message. `str(err)` would
        # stringify pytest's ExceptionInfo wrapper, which is not the message.
        message = err.value.args[0]
        for item in (str(self.locations), self.filename):
            assert item in message
        # Check we only tried downloading files until successful download.
        mock_get.assert_called_once_with(self.locations[0], stream=True)
        mock_write_file.assert_called_once_with(mock_200, self.filename)

    def test_download_check_exists_true(self):
        """Test a file already existing on the filesystem is not downloaded."""
        with patch('wasapi_client.check_exists', return_value=True), \
                patch('requests.Session', autospec=True) as mock_session:
            file_data = wc.download_file(self.data_file, mock_session, self.filename)

        # Check `verified` has been set True on the DataFile instance.
        assert file_data.verified
        # Check that no get request was made.
        assert not mock_session.get.called