def test_check_all_urls(self): """Should check all the URLs for one provider""" mock_lock = mock.Mock() with mock.patch('geospaas_harvesting.verify_urls.Lock', return_value=mock_lock), \ mock.patch( 'geospaas_harvesting.verify_urls.BoundedThreadPoolExecutor') as mock_pool, \ mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects') as mock_manager, \ mock.patch('concurrent.futures.as_completed'), \ mock.patch('geospaas_harvesting.verify_urls.HTTPProvider' '.check_and_write_stale_url') as mock_write: mock_executor = mock_pool.return_value.__enter__.return_value mock_dataset_uri = mock.Mock() mock_manager.filter.return_value.iterator.return_value = [mock_dataset_uri] # call without throttle: 50 workers provider = verify_urls.HTTPProvider('test', {'url': 'https://foo/'}) with self.assertLogs(verify_urls.logger, level=logging.INFO): provider.check_all_urls('output.txt') mock_executor.submit.assert_called_once_with( mock_write, mock_lock, 'output.txt', mock_dataset_uri) mock_pool.assert_called_once_with(max_workers=50, queue_limit=2000) mock_pool.reset_mock() # call with throttle: 1 worker provider = verify_urls.HTTPProvider('test', {'url': 'https://foo/', 'throttle': 1}) with self.assertLogs(verify_urls.logger, level=logging.INFO): provider.check_all_urls('output.txt') mock_executor.submit.assert_called_once_with( mock_write, mock_lock, 'output.txt', mock_dataset_uri) mock_pool.assert_called_once_with(max_workers=1, queue_limit=2000) mock_pool.reset_mock()
def test_read_config(self): """Should read the provider configuration from a YAML file""" config = textwrap.dedent('''--- podaac: url: 'https://opendap.jpl.nasa.gov/opendap/' scihub: url: 'https://scihub.copernicus.eu/' username: !ENV 'COPERNICUS_OPEN_HUB_USERNAME' password: !ENV 'COPERNICUS_OPEN_HUB_PASSWORD' creodias: url: 'https://zipper.creodias.eu/' username: !ENV 'CREODIAS_USERNAME' password: !ENV 'CREODIAS_PASSWORD' token_url: 'https://auth.creodias.eu/auth/realms/DIAS/protocol/openid-connect/token' client_id: 'CLOUDFERRO_PUBLIC' throttle: 1 auth_renew: 36000 rtofs: url: 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/' ''') environment = { 'COPERNICUS_OPEN_HUB_USERNAME': '******', 'COPERNICUS_OPEN_HUB_PASSWORD': '******', 'CREODIAS_USERNAME': '******', 'CREODIAS_PASSWORD': '******', } # we check that get_auth() is called with the right arguments # by replacing its output by its arguments with mock.patch('geospaas_harvesting.verify_urls.open', mock.mock_open(read_data=config)), \ mock.patch('os.environ', environment): providers = verify_urls.read_config('foo.yml') self.assertListEqual(providers, [ verify_urls.HTTPProvider('podaac', { 'url': 'https://opendap.jpl.nasa.gov/opendap/', }), verify_urls.HTTPProvider('scihub', { 'url': 'https://scihub.copernicus.eu/', 'username': '******', 'password': '******' }), verify_urls.HTTPProvider('creodias', { 'url': 'https://zipper.creodias.eu/', 'username': '******', 'password': '******', 'token_url': 'https://auth.creodias.eu/auth/realms/DIAS/protocol/' 'openid-connect/token', 'client_id': 'CLOUDFERRO_PUBLIC', 'throttle': 1, 'auth_renew': 36000 }), verify_urls.FTPProvider('rtofs', { 'url': 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/' }) ])
def test_check_providers(self): """Should run URL checks for each provider in a separate process. If an exception is raised in one of the sub-processes, check_providers() should return False and the traceback of the exception should be logged """ providers = [ verify_urls.HTTPProvider('scihub', { 'url': 'https://scihub.copernicus.eu/', 'username': '******', 'password': '******', 'throttle': 0 }), verify_urls.HTTPProvider('podaac', { 'url': 'https://opendap.jpl.nasa.gov/opendap/', 'username': '******', 'password': '******', 'throttle': 0 }), verify_urls.FTPProvider('rtofs', { 'url': 'ftp://ftpprd.ncep.noaa.gov/pub/data/nccf/com/rtofs/prod/' }), ] with mock.patch('concurrent.futures.ProcessPoolExecutor') as mock_pool, \ mock.patch('geospaas_harvesting.verify_urls.datetime') as mock_datetime, \ mock.patch('geospaas_harvesting.verify_urls.' 'HTTPProvider.check_all_urls') as mock_http_check, \ mock.patch('geospaas_harvesting.verify_urls.' 'FTPProvider.check_all_urls') as mock_ftp_check, \ mock.patch('concurrent.futures.as_completed', iter): mock_executor = mock_pool.return_value.__enter__.return_value mock_datetime.now.return_value.strftime.return_value = 'time' self.assertTrue(verify_urls.check_providers('foo', providers)) mock_executor.submit.assert_has_calls(( mock.call( mock_http_check, os.path.join('foo', 'scihub_stale_urls_time.txt')), mock.call( mock_http_check, os.path.join('foo', 'podaac_stale_urls_time.txt')), mock.call( mock_ftp_check, os.path.join('foo', 'rtofs_stale_urls_time.txt')) ), any_order=True) self.assertEqual(len(mock_executor.submit.call_args_list), 3) mock_executor.submit.return_value.result.side_effect = AttributeError with self.assertLogs(verify_urls.logger, level=logging.ERROR): self.assertFalse(verify_urls.check_providers('foo', providers))
def test_auth_basic(self): """The auth property should return the right authentication object based on the provider attributes """ provider = verify_urls.HTTPProvider('test', {'username': '******', 'password': '******'}) self.assertEqual( provider.auth, requests.auth.HTTPBasicAuth('user', 'pass'))
def test_check_url_200(self): """Should send a HEAD request to the URL and return whether the URL is valid or not. """ provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') mock_response = mock.MagicMock(status_code=200, headers={}) with mock.patch('geospaas_harvesting.utils.http_request', return_value=mock_response): self.assertEqual(provider.check_url(mock_dataset_uri), verify_urls.PRESENT)
def test_check_url_http_error(self): """Should send a HEAD request to the URL and return 'http_<error_code>' if an error code other than 404 is received """ provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') mock_response = mock.MagicMock(status_code=503, headers={}) with mock.patch('geospaas_harvesting.utils.http_request', return_value=mock_response) as mock_request: self.assertEqual(provider.check_url(mock_dataset_uri), 'http_503') mock_request.assert_called_once()
def test_check_and_write_stale_url_valid(self): """Should not write anything to the output file if the URL is valid """ provider = verify_urls.HTTPProvider('test', {}) mock_lock = mock.MagicMock() with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url', return_value=verify_urls.PRESENT), \ mock.patch('geospaas_harvesting.verify_urls.open') as mock_open: provider.check_and_write_stale_url(mock_lock, 'output.txt', mock.Mock()) mock_open.assert_not_called()
def test_check_all_urls_thread_error(self): """Exceptions happening in the threads should be raised in the main thread """ provider = verify_urls.HTTPProvider('test', {'url': 'https://foo'}) with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider' '.check_and_write_stale_url') as mock_write, \ mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects') as mock_manager: mock_write.side_effect = ValueError mock_manager.filter.return_value.iterator.return_value = [mock.Mock()] with self.assertRaises(ValueError), \ self.assertLogs(verify_urls.logger, level=logging.INFO): provider.check_all_urls('out.txt')
def test_delete_stale_urls(self): """404 URLs should be deleted unless the force option is used """ provider = verify_urls.HTTPProvider('test', { 'url': 'https://foo', 'username': '******', 'password': '******', 'auth_renew': -1 }) file_contents = f'{verify_urls.ABSENT} 12 https://foo/bar\nhttp_500 13 https://foo/baz' check_url_results = (verify_urls.ABSENT, 'http_500') dataset_uris = {12: 'https://foo/bar', 13: 'https://foo/baz'} mock_manager = mock.Mock() mock_manager.filter.side_effect = lambda id: [mock.Mock(uri=dataset_uris.get(id))] with mock.patch('geospaas_harvesting.verify_urls.find_provider', return_value=provider), \ mock.patch('geospaas_harvesting.verify_urls.DatasetURI.objects', mock_manager): # force == False, only the URL that returns 404 must be # deleted buffer = io.StringIO(file_contents) with mock.patch('geospaas_harvesting.verify_urls.open', return_value=buffer), \ mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url', side_effect=check_url_results), \ mock.patch('geospaas_harvesting.verify_urls.remove_dataset_uri', return_value=(True, True)) as mock_remove: self.assertEqual(verify_urls.delete_stale_urls('', {}, force=False), (1, 1)) self.assertListEqual( [args[0][0].uri for args in mock_remove.call_args_list], ['https://foo/bar']) # force == True, both URLs must be deleted buffer = io.StringIO(file_contents) with mock.patch('geospaas_harvesting.verify_urls.open', return_value=buffer), \ mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url', side_effect=check_url_results), \ mock.patch('geospaas_harvesting.verify_urls.remove_dataset_uri', return_value=(True, True)) as mock_remove: self.assertEqual(verify_urls.delete_stale_urls('', {}, force=True), (2, 2)) self.assertListEqual( [args[0][0].uri for args in mock_remove.call_args_list], ['https://foo/bar', 'https://foo/baz']) # The URI does not exist buffer = io.StringIO(file_contents) with mock.patch('geospaas_harvesting.verify_urls.open', return_value=buffer): mock_manager.filter.side_effect = None mock_manager.filter.return_value = [] with self.assertLogs(verify_urls.logger, level=logging.WARNING): self.assertEqual(verify_urls.delete_stale_urls('', {}, force=False), (0, 0))
def test_find_provider(self): """Should return the right provider given a URL""" scihub_provider = verify_urls.HTTPProvider('scihub', { 'url': 'https://scihub.copernicus.eu/', 'username': '******', 'password': '******', 'throttle': 0 }) podaac_provider = verify_urls.HTTPProvider('podaac', { 'url': 'https://opendap.jpl.nasa.gov/opendap/', 'username': '******', 'password': '******', 'throttle': 0 }) providers = [scihub_provider, podaac_provider] self.assertIsNone(verify_urls.find_provider('foo.txt', providers)) self.assertEqual( verify_urls.find_provider('scihub_stale_urls_2021-05-25T10:22:27.txt', providers), scihub_provider) self.assertEqual( verify_urls.find_provider('podaac_stale_urls_2021-05-25T10:22:28.txt', providers), podaac_provider)
def test_check_url_connection_error_retry(self): """The request should be retried if a ConnectionError occurs""" provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') with mock.patch('geospaas_harvesting.utils.http_request') as mock_request, \ mock.patch('time.sleep') as mock_sleep: mock_request.side_effect = ( requests.exceptions.ConnectionError, requests.exceptions.ConnectionError, mock.MagicMock(status_code=200, headers={}) ) with self.assertLogs(verify_urls.logger, level=logging.ERROR): provider.check_url(mock_dataset_uri, tries=5) self.assertListEqual(mock_sleep.call_args_list, [mock.call(5), mock.call(5), mock.call(0)])
def test_check_url_429_too_many_retries(self): """When there are too many retries, an exception should be raised """ provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') mock_responses = ( mock.MagicMock(status_code=429, headers={}), mock.MagicMock(status_code=200, headers={}) ) with mock.patch('geospaas_harvesting.utils.http_request', side_effect=mock_responses) as mock_request: with self.assertRaises(verify_urls.TooManyRequests): provider.check_url(mock_dataset_uri, tries=1) mock_request.assert_called_once()
def test_check_and_write_stale_url_invalid(self): """Should write the URL info to the output file if the URL is invalid """ provider = verify_urls.HTTPProvider('test', {}) with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.check_url', return_value=verify_urls.ABSENT), \ mock.patch('geospaas_harvesting.verify_urls.open') as mock_open: mock_file = mock.MagicMock() mock_open.return_value.__enter__.return_value = mock_file mock_dataset_uri = mock.Mock() mock_dataset_uri.id = 1 mock_dataset_uri.uri = 'https://foo' provider.check_and_write_stale_url(mock.MagicMock(), 'output.txt', mock_dataset_uri) mock_file.write.assert_called_once_with( f"{verify_urls.ABSENT} 1 https://foo{os.linesep}")
def test_auth_oauth2(self): """The auth property should return the right authentication object based on the provider attributes """ provider = verify_urls.HTTPProvider('test', { 'username': '******', 'password': '******', 'token_url': 'https://foo', 'client_id': 'CLIENT' }) mock_oauth2 = mock.Mock() with mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.build_oauth2', return_value=mock_oauth2) as mock_build_oauth2: self.assertEqual( provider.auth, mock_oauth2) mock_build_oauth2.assert_called_once_with('user', 'pass', 'https://foo', 'CLIENT')
def test_check_url_connection_error_too_many_retries(self): """The request should be retried if a ConnectionError occurs and the exception should be raised if the retry limit is reached """ provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') with mock.patch('geospaas_harvesting.utils.http_request') as mock_request, \ mock.patch('time.sleep') as mock_sleep: mock_request.side_effect = ( requests.exceptions.ConnectionError, requests.exceptions.ConnectionError, ) with self.assertLogs(verify_urls.logger, level=logging.ERROR), \ self.assertRaises(requests.exceptions.ConnectionError): provider.check_url(mock_dataset_uri, tries=2) self.assertListEqual(mock_sleep.call_args_list, [mock.call(5)])
def test_auth_renew(self): """Test that authentication is renewed when necessary""" provider = verify_urls.HTTPProvider('test', { 'username': '******', 'password': '******', 'token_url': 'token', 'client_id': 'ID', 'auth_renew': 1 }) with mock.patch('time.monotonic', side_effect=(1, 2, 2.1)), \ mock.patch('geospaas_harvesting.verify_urls.HTTPProvider.build_oauth2', side_effect=('auth1', 'auth2', 'auth3')): # First call -> first return value from build_oauth2() self.assertEqual(provider.auth, 'auth1') # Second call, one second later -> second return value from build_oauth2() self.assertEqual(provider.auth, 'auth2') # Third call, less than one second later -> the value does not change self.assertEqual(provider.auth, 'auth2')
def test_check_url_429_no_header(self): """When an error 429 occurs, the URL should ne retried after a delay """ provider = verify_urls.HTTPProvider('test', {}) mock_dataset_uri = mock.Mock(id=1, uri='https://foo') mock_responses = ( mock.MagicMock(status_code=429, headers={}), mock.MagicMock(status_code=404, headers={}) ) with mock.patch('geospaas_harvesting.utils.http_request', side_effect=mock_responses) as mock_request, \ mock.patch('time.sleep') as mock_sleep: with self.assertLogs(verify_urls.logger, level=logging.WARNING): self.assertEqual(provider.check_url(mock_dataset_uri),verify_urls.ABSENT) self.assertEqual(mock_request.call_count, 2) self.assertListEqual(mock_sleep.call_args_list, [mock.call(60), mock.call(0)])
def test_instantiation(self): """Test that the attributes are correctly initialized""" provider = verify_urls.HTTPProvider('test', {'foo': 'bar'}) self.assertEqual(provider.name, 'test') self.assertEqual(provider.config, {'foo': 'bar'}) self.assertEqual(provider._auth_start, None)
def test_auth_no_auth(self): """The auth property should return None when no authentication method can be determined """ provider = verify_urls.HTTPProvider('test', {}) self.assertIsNone(provider.auth)