def testFindCreateClusterEnvSucceed(self):
    """Create a new cluster env in one attempt when the search has no match."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)

    # After setting the env, a name must be present but no ID yet.
    self.assertTrue(manager.cluster_env_name)
    self.assertFalse(manager.cluster_env_id)

    # The mocked search only returns an entry named "no_match".
    unrelated_env = APIDict(name="no_match", id="wrong")
    self.sdk.returns["search_cluster_environments"] = APIDict(
        metadata=APIDict(next_paging_token=None),
        results=[unrelated_env],
    )
    self.sdk.returns["create_cluster_environment"] = APIDict(
        result=APIDict(id="correct")
    )

    manager.create_cluster_env()

    # Both APIs were called exactly once (no retry needed).
    self.assertEqual(manager.cluster_env_id, "correct")
    counter = self.sdk.call_counter
    self.assertEqual(counter["search_cluster_environments"], 1)
    self.assertEqual(counter["create_cluster_environment"], 1)
    self.assertEqual(len(counter), 2)
def testBuildClusterEnvPreBuildSucceeded(self):
    """Use the build listed as succeeded without calling any other API."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # Start without a known build ID and with a clean SDK mock.
    manager.cluster_env_build_id = None
    self.sdk.reset()

    # The listing holds a failed build and a later, succeeded one.
    failed_build = APIDict(id="build_failed", status="failed", created_at=0)
    succeeded_build = APIDict(
        id="build_succeeded", status="succeeded", created_at=1
    )
    self.sdk.returns["list_cluster_environment_builds"] = APIDict(
        results=[failed_build, succeeded_build]
    )

    manager.build_cluster_env(timeout=600)

    self.assertTrue(manager.cluster_env_build_id)
    self.assertEqual(manager.cluster_env_build_id, "build_succeeded")
    counter = self.sdk.call_counter
    self.assertEqual(counter["list_cluster_environment_builds"], 1)
    self.assertEqual(len(counter), 1)
def testFindCreateClusterEnvFailFail(self):
    """Raise ClusterEnvCreateError when env creation is mocked with fail_always."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)

    # After setting the env, a name must be present but no ID yet.
    self.assertTrue(manager.cluster_env_name)
    self.assertFalse(manager.cluster_env_id)

    # The mocked search only returns an entry named "no_match".
    unrelated_env = APIDict(name="no_match", id="wrong")
    self.sdk.returns["search_cluster_environments"] = APIDict(
        metadata=APIDict(next_paging_token=None),
        results=[unrelated_env],
    )
    self.sdk.returns["create_cluster_environment"] = fail_always

    with self.assertRaises(ClusterEnvCreateError):
        manager.create_cluster_env()

    # No cluster env ID found or created
    self.assertFalse(manager.cluster_env_id)

    # Both APIs were called twice (retry after fail)
    counter = self.sdk.call_counter
    self.assertEqual(counter["search_cluster_environments"], 2)
    self.assertEqual(counter["create_cluster_environment"], 2)
    self.assertEqual(len(counter), 2)
def testFindCreateClusterComputeCreateFailSucceed(self):
    """Create a cluster compute when the create call is mocked with fail_once."""
    manager = self.cluster_manager
    manager.set_cluster_compute(self.cluster_compute)

    # After setting the compute, a name must be present but no ID yet.
    self.assertTrue(manager.cluster_compute_name)
    self.assertFalse(manager.cluster_compute_id)

    # The mocked search only returns an entry named "no_match".
    unrelated_compute = APIDict(name="no_match", id="wrong")
    self.sdk.returns["search_cluster_computes"] = APIDict(
        metadata=APIDict(next_paging_token=None),
        results=[unrelated_compute],
    )
    created = APIDict(result=APIDict(id="correct"))
    self.sdk.returns["create_cluster_compute"] = fail_once(result=created)

    manager.create_cluster_compute()

    # Both APIs were called twice (retry after fail)
    self.assertEqual(manager.cluster_compute_id, "correct")
    counter = self.sdk.call_counter
    self.assertEqual(counter["search_cluster_computes"], 2)
    self.assertEqual(counter["create_cluster_compute"], 2)
    self.assertEqual(len(counter), 2)
def testSessionStartStartupError(self):
    """Expect ClusterStartupError when ``start_cluster`` is mocked with ``_fail``."""
    manager = self.cluster_manager
    manager.cluster_env_id = "correct"
    manager.cluster_compute_id = "correct"

    mocked = self.sdk.returns
    mocked["create_cluster"] = APIDict(result=APIDict(id="success"))
    mocked["start_cluster"] = _fail

    with self.assertRaises(ClusterStartupError):
        manager.start_cluster()
def setUp(self) -> None:
    """Create a fresh mock SDK, test configs and cluster manager per test."""
    mock_sdk = MockSDK()
    self.sdk = mock_sdk
    # Canned project lookup; presumably consumed while the manager is
    # constructed -- TODO confirm against the manager's constructor.
    mock_sdk.returns["get_project"] = APIDict(
        result=APIDict(name="release_unit_tests")
    )

    self.cluster_env = TEST_CLUSTER_ENV
    self.cluster_compute = TEST_CLUSTER_COMPUTE

    self.cluster_manager = self.cls(
        project_id=UNIT_TEST_PROJECT_ID,
        sdk=mock_sdk,
        test_name=f"unit_test__{self.__class__.__name__}",
    )
    # Reset the mock so each test starts from a clean SDK state.
    mock_sdk.reset()
def testClusterName(self):
    """Check the cluster name pattern with and without the smoke test flag."""
    sdk = MockSDK()
    sdk.returns["get_project"] = APIDict(result=APIDict(name="release_unit_tests"))

    # (smoke_test flag, regex the resulting cluster name must match)
    cases = (
        (False, r"^test_\d+$"),
        (True, r"^test-smoke-test_\d+$"),
    )
    for smoke_test, name_pattern in cases:
        cluster_manager = self.cls(
            test_name="test",
            project_id=UNIT_TEST_PROJECT_ID,
            smoke_test=smoke_test,
            sdk=sdk,
        )
        self.assertRegex(cluster_manager.cluster_name, name_pattern)
def testInvalidClusterIdOverride(self):
    """Check error handling when a cluster env ID override is passed to the run.

    First the environment lookup is mocked to return ``None`` and the run
    must raise ``ClusterEnvCreateError``. Then the lookup returns an
    environment and the run must still raise, but with some other exception
    type.
    """
    result = Result()

    self._succeed_until("driver_setup")

    # Lookup of the overriding env yields nothing.
    self.sdk.returns["get_cluster_environment"] = None
    with self.assertRaises(ClusterEnvCreateError):
        self._run(result, cluster_env_id="existing")

    # Lookup of the overriding env now yields a result.
    self.sdk.returns["get_cluster_environment"] = APIDict(
        result=APIDict(config_json={"overridden": True})
    )
    with self.assertRaises(Exception) as cm:  # Fail somewhere else
        self._run(result, cluster_env_id="existing")

    # This check has to live after the ``with`` block: ``cm.exception`` is
    # only populated once the context manager exits, and a statement placed
    # after the raising call inside the block would never execute.
    self.assertNotIsInstance(cm.exception, ClusterEnvCreateError)
def testBuildClusterEnvNotFound(self):
    """Raise ClusterEnvBuildError ("No build found") on an empty build listing."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # The build listing comes back empty.
    empty_listing = APIDict(results=[])
    self.sdk.returns["list_cluster_environment_builds"] = empty_listing

    with self.assertRaisesRegex(ClusterEnvBuildError, "No build found"):
        manager.build_cluster_env(timeout=600)
def testBuildClusterBuildSucceed(self):
    """Poll a pending build until the mocked status turns to succeeded."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # Build, succeed after 300 seconds
    manager.cluster_env_build_id = None
    self.sdk.reset()

    failed_build = APIDict(id="build_failed", status="failed", created_at=0)
    pending_build = APIDict(id="build_succeeded", status="pending", created_at=1)
    self.sdk.returns["list_cluster_environment_builds"] = APIDict(
        results=[failed_build, pending_build]
    )

    with freeze_time() as frozen_time:
        still_building = APIDict(result=APIDict(status="in_progress"))
        finished = APIDict(result=APIDict(status="succeeded"))
        # The callback moves the frozen clock forward by 10 seconds.
        self.sdk.returns["get_build"] = _DelayedResponse(
            lambda: frozen_time.tick(delta=10),
            finish_after=300,
            before=still_building,
            after=finished,
        )
        manager.build_cluster_env(timeout=600)

    self.assertTrue(manager.cluster_env_build_id)
    counter = self.sdk.call_counter
    self.assertEqual(counter["list_cluster_environment_builds"], 1)
    self.assertGreaterEqual(counter["get_build"], 9)
    self.assertEqual(len(counter), 2)
def testBuildClusterEnvPreBuildFailed(self):
    """Raise ClusterEnvBuildError when the only listed build has failed.

    NOTE(review): this file contains another method with the same name; if
    both live in the same class, the later definition shadows the earlier
    one -- confirm which is intended.
    """
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # Build failed on first lookup
    manager.cluster_env_build_id = None
    self.sdk.reset()
    failed_build = APIDict(id="build_failed", status="failed", created_at=0)
    self.sdk.returns["list_cluster_environment_builds"] = APIDict(
        results=[failed_build]
    )

    with self.assertRaisesRegex(ClusterEnvBuildError, "Cluster env build failed"):
        manager.build_cluster_env(timeout=600)

    # No build ID is recorded and only the listing API was touched.
    self.assertFalse(manager.cluster_env_build_id)
    counter = self.sdk.call_counter
    self.assertEqual(counter["list_cluster_environment_builds"], 1)
    self.assertEqual(len(counter), 1)
def testFindCreateClusterComputeExisting(self):
    """Reuse the ID of a search result whose name equals the compute name."""
    manager = self.cluster_manager
    manager.set_cluster_compute(self.cluster_compute)

    # After setting the compute, a name must be present but no ID yet.
    self.assertTrue(manager.cluster_compute_name)
    self.assertFalse(manager.cluster_compute_id)

    # Search results: one unrelated entry, then one carrying our name.
    unrelated_compute = APIDict(name="no_match", id="wrong")
    matching_compute = APIDict(name=manager.cluster_compute_name, id="correct")
    self.sdk.returns["search_cluster_computes"] = APIDict(
        metadata=APIDict(next_paging_token=None),
        results=[unrelated_compute, matching_compute],
    )

    manager.create_cluster_compute()

    # Only the search API was called.
    self.assertEqual(manager.cluster_compute_id, "correct")
    counter = self.sdk.call_counter
    self.assertEqual(counter["search_cluster_computes"], 1)
    self.assertEqual(len(counter), 1)
def testBuildClusterEnvPreBuildFailed(self):
    """A failed pre-build triggers a new build, which is mocked to fail too.

    NOTE(review): this file contains another method with the same name; if
    both live in the same class, this later definition shadows the earlier
    one -- confirm which is intended.
    """
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # Build failed on first lookup
    manager.cluster_env_build_id = None
    self.sdk.reset()
    failed_build = APIDict(
        id="build_failed",
        status="failed",
        created_at=0,
        error_message=None,
        config_json={},
    )
    self.sdk.returns["list_cluster_environment_builds"] = APIDict(
        results=[failed_build]
    )
    self.sdk.returns["create_cluster_environment_build"] = APIDict(
        result=APIDict(id="new_build_id")
    )
    # The status lookup reports "failed" as well.
    rebuilt = APIDict(
        id="build_now_succeeded",
        status="failed",
        created_at=0,
        error_message=None,
        config_json={},
    )
    self.sdk.returns["get_build"] = APIDict(result=rebuilt)

    with self.assertRaisesRegex(ClusterEnvBuildError, "Cluster env build failed"):
        manager.build_cluster_env(timeout=600)

    self.assertFalse(manager.cluster_env_build_id)
    counter = self.sdk.call_counter
    self.assertEqual(counter["list_cluster_environment_builds"], 1)
    self.assertEqual(counter["create_cluster_environment_build"], 1)
    self.assertEqual(len(counter), 3)
def testSessionStartStartupSuccess(self):
    """Start a cluster whose start operation completes within the timeout."""
    manager = self.cluster_manager
    manager.cluster_env_id = "correct"
    manager.cluster_compute_id = "correct"

    mocked = self.sdk.returns
    mocked["create_cluster"] = APIDict(result=APIDict(id="success"))
    mocked["start_cluster"] = APIDict(result=APIDict(id="cop_id", completed=False))

    with freeze_time() as frozen_time:
        frozen_time.tick(delta=0.1)

        not_done = APIDict(result=APIDict(completed=False))
        done = APIDict(result=APIDict(completed=True))
        # The callback moves the frozen clock forward by 10 seconds.
        mocked["get_cluster_operation"] = _DelayedResponse(
            lambda: frozen_time.tick(delta=10),
            finish_after=300,
            before=not_done,
            after=done,
        )
        mocked["get_cluster"] = APIDict(result=APIDict(state="Running"))

        # Timeout is long enough
        manager.start_cluster(timeout=400)
def testSetClusterEnv(self):
    """Check the usage-stats tag env var written by ``set_cluster_env``."""
    sdk = MockSDK()
    sdk.returns["get_project"] = APIDict(result=APIDict(name="release_unit_tests"))

    # (test_name, smoke_test flag, expected RAY_USAGE_STATS_EXTRA_TAGS value)
    cases = (
        ("test", False, "test_name=test;smoke_test=False"),
        ("Test", True, "test_name=Test;smoke_test=True"),
    )
    for test_name, smoke_test, expected_tags in cases:
        cluster_manager = self.cls(
            test_name=test_name,
            project_id=UNIT_TEST_PROJECT_ID,
            smoke_test=smoke_test,
            sdk=sdk,
        )
        cluster_manager.set_cluster_env({})
        env_vars = cluster_manager.cluster_env["env_vars"]
        self.assertEqual(env_vars["RAY_USAGE_STATS_EXTRA_TAGS"], expected_tags)
def testSessionStartStartupTimeout(self):
    """Raise ClusterStartupTimeout when the timeout ends before startup does."""
    manager = self.cluster_manager
    manager.cluster_env_id = "correct"
    manager.cluster_compute_id = "correct"

    mocked = self.sdk.returns
    mocked["create_cluster"] = APIDict(result=APIDict(id="success"))
    mocked["start_cluster"] = APIDict(result=APIDict(id="cop_id", completed=False))

    with freeze_time() as frozen_time, self.assertRaises(ClusterStartupTimeout):
        not_done = APIDict(result=APIDict(completed=False))
        done = APIDict(result=APIDict(completed=True))
        # The callback moves the frozen clock forward by 10 seconds.
        mocked["get_cluster_operation"] = _DelayedResponse(
            lambda: frozen_time.tick(delta=10),
            finish_after=300,
            before=not_done,
            after=done,
        )

        # Timeout before startup finishes
        manager.start_cluster(timeout=200)
def testBuildClusterEnvBuildTimeout(self):
    """Raise ClusterEnvBuildTimeout when the build outlasts the timeout."""
    manager = self.cluster_manager
    manager.set_cluster_env(self.cluster_env)
    manager.cluster_env_id = "correct"

    # Build, but timeout after 100 seconds
    manager.cluster_env_build_id = None
    self.sdk.reset()

    failed_build = APIDict(
        id="build_failed",
        status="failed",
        created_at=0,
        error_message=None,
        config_json={},
    )
    pending_build = APIDict(
        id="build_succeeded",
        status="pending",
        created_at=1,
        error_message=None,
        config_json={},
    )
    self.sdk.returns["list_cluster_environment_builds"] = APIDict(
        results=[failed_build, pending_build]
    )

    with freeze_time() as frozen_time, self.assertRaisesRegex(
        ClusterEnvBuildTimeout, "Time out when building cluster env"
    ):
        still_building = APIDict(
            result=APIDict(status="in_progress", error_message=None, config_json={})
        )
        finished = APIDict(
            result=APIDict(status="succeeded", error_message=None, config_json={})
        )
        # The callback moves the frozen clock forward by 10 seconds.
        self.sdk.returns["get_build"] = _DelayedResponse(
            lambda: frozen_time.tick(delta=10),
            finish_after=300,
            before=still_building,
            after=finished,
        )
        manager.build_cluster_env(timeout=100)

    # No build ID is recorded after the timeout.
    self.assertFalse(manager.cluster_env_build_id)
    counter = self.sdk.call_counter
    self.assertEqual(counter["list_cluster_environment_builds"], 1)
    self.assertGreaterEqual(counter["get_build"], 9)
    self.assertEqual(len(counter), 2)
def setUp(self) -> None:
    """Prepare the end-to-end fixture.

    Creates a temp working directory with cluster config files and two
    driver scripts, a mock SDK, mock cluster manager / command runner /
    file manager classes (registered in the module-level lookup tables
    under the ``unit_test`` type), a mock alerter, and the ``Test`` that
    refers to all of them.
    """
    self.tempdir = tempfile.mkdtemp()
    self.sdk = MockSDK()
    self.sdk.returns["get_project"] = APIDict(
        result=APIDict(name="unit_test_project")
    )

    self.writeClusterEnv("{'env': true}")
    self.writeClusterCompute("{'compute': true}")

    # Driver setup scripts: one exits non-zero, one exits zero.
    driver_scripts = (
        ("driver_fail.sh", "exit 1\n"),
        ("driver_succeed.sh", "exit 0\n"),
    )
    for script_name, script_body in driver_scripts:
        with open(os.path.join(self.tempdir, script_name), "wt") as script:
            script.write(script_body)

    # Dicts assigned to the mock instances as ``return_dict``. The same
    # objects are stored on the test case and captured by the classes below.
    cluster_manager_mocks = {}
    command_runner_mocks = {}
    file_manager_mocks = {}
    self.cluster_manager_return = cluster_manager_mocks
    self.command_runner_return = command_runner_mocks
    self.file_manager_return = file_manager_mocks

    # Plain locals so the nested classes can close over them.
    mock_sdk = self.sdk
    working_dir_override = self.tempdir

    class MockClusterManager(MockReturn, FullClusterManager):
        def __init__(
            self,
            test_name: str,
            project_id: str,
            sdk=None,
            smoke_test: bool = False,
        ):
            # The ``sdk`` argument is ignored; the fixture's mock SDK is used.
            super().__init__(test_name, project_id, mock_sdk, smoke_test=smoke_test)
            self.return_dict = cluster_manager_mocks

    class MockCommandRunner(MockReturn, CommandRunner):
        # NOTE(review): the class-level default points at the *cluster
        # manager* dict and is replaced per instance in ``__init__`` --
        # possibly a copy-paste slip; confirm before changing.
        return_dict = self.cluster_manager_return

        def __init__(
            self,
            cluster_manager: ClusterManager,
            file_manager: FileManager,
            working_dir: str,
        ):
            # ``working_dir`` is ignored; the fixture's temp dir is used.
            super().__init__(cluster_manager, file_manager, working_dir_override)
            self.return_dict = command_runner_mocks

    class MockFileManager(MockReturn, FileManager):
        def __init__(self, cluster_manager: ClusterManager):
            super().__init__(cluster_manager)
            self.return_dict = file_manager_mocks

    self.mock_alert_return = None

    def mock_alerter(test: Test, result: Result):
        # Read at call time so tests can change the value after setUp.
        return self.mock_alert_return

    result_to_handle_map["unit_test_alerter"] = mock_alerter

    type_str_to_command_runner["unit_test"] = MockCommandRunner
    command_runner_to_cluster_manager[MockCommandRunner] = MockClusterManager
    command_runner_to_file_manager[MockCommandRunner] = MockFileManager

    run_config = {
        "type": "unit_test",
        "prepare": "prepare_cmd",
        "script": "test_cmd",
        "wait_for_nodes": {"num_nodes": 4, "timeout": 40},
    }
    cluster_config = {
        "cluster_env": "cluster_env.yaml",
        "cluster_compute": "cluster_compute.yaml",
    }
    self.test = Test(
        name="unit_test_end_to_end",
        run=run_config,
        working_dir=self.tempdir,
        cluster=cluster_config,
        alert="unit_test_alerter",
        driver_setup="driver_fail.sh",
    )

    self.anyscale_project = "prj_unit12345678"
    self.ray_wheels_url = "http://mock.wheels/"