def test_job_level_gc(start_cluster, field, spec_format, tmp_path):
    """Check that a job-level runtime env is cleaned up after the job exits.

    The test connects with a job-wide runtime env, runs a task that imports
    ``pip_install_test``, disconnects, waits until
    ``check_local_files_gced`` reports the local files as collected, and
    finally reconnects with the same env to confirm it can be set up again.
    """
    # A single-node cluster is required. With a simulated multi-node cluster
    # the conda installs would run concurrently, one per node, and since all
    # "nodes" really share one machine those installs produce errors.
    cluster, address = start_cluster

    def connect_with_env():
        # The env dict is generated anew for every connection, exactly once
        # per ray.init() call.
        ray.init(
            address,
            runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path),
        )

    connect_with_env()

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    # Running the task forces the runtime env to be installed.
    first_result = ray.get(f.remote())
    assert first_result
    # While the job is still connected nothing may have been collected.
    assert not check_local_files_gced(cluster)

    ray.shutdown()

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)

    # Reconnect with the same env. This verifies that the conda env was
    # removed completely rather than left behind in a corrupted state that
    # would prevent installing the same conda env again.
    connect_with_env()
    second_result = ray.get(f.remote())
    assert second_result
def test_detached_actor_gc(
    self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
):
    """Check that a detached actor keeps its env alive until the actor dies.

    A detached actor named ``"test"`` is created under a job-level runtime
    env. The local files must not be collected while the actor exists, even
    after the creating driver disconnects and a new driver connects. Only
    after the actor is killed are the files expected to be collected.
    """
    cluster, address = start_cluster
    job_env = generate_runtime_env_dict(field, spec_format, tmp_path)
    ray.init(address, namespace="test", runtime_env=job_env)

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    detached_actor = A.options(name="test", lifetime="detached").remote()
    ray.get(detached_actor.test_import.remote())
    assert not check_local_files_gced(cluster)

    # Drop the driver that created the actor ...
    ray.shutdown()

    # ... and attach a new one to the same namespace, this time without a
    # runtime env of its own.
    ray.init(address, namespace="test")
    assert not check_local_files_gced(cluster)

    # The detached actor must still be reachable by name and still usable.
    detached_actor = ray.get_actor("test")
    import_ok = ray.get(detached_actor.test_import.remote())
    assert import_ok

    ray.kill(detached_actor)

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)
def test_actor_level_gc(
    self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
):
    """Tests that an actor-level runtime env is GC'd when the actors exit.

    Several actors are started with the same runtime env. Before each actor
    is killed the local files must still be present; once all of them have
    been killed the files are expected to be collected.
    """
    cluster, address = start_cluster
    ray.init(address)

    runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    NUM_ACTORS = 5
    actors = [
        A.options(runtime_env=runtime_env).remote() for _ in range(NUM_ACTORS)
    ]
    # Make sure every actor has finished setting up the env before any of
    # them is killed.
    ray.get([a.test_import.remote() for a in actors])

    # Iterate over the actors themselves rather than a hard-coded range so
    # the loop always matches NUM_ACTORS.
    for actor in actors:
        # At least one actor using the env is still alive at this point, so
        # the files must not have been collected yet.
        assert not check_local_files_gced(cluster)
        ray.kill(actor)

    wait_for_condition(lambda: check_local_files_gced(cluster))
def test_task_level_gc(ray_start_cluster, field, spec_format, tmp_path):
    """Tests that a task-level runtime env is GC'd when its worker exits.

    The expectation depends on the cluster's ``num_workers_soft_limit``
    system config entry: when it is ``0`` the local files are expected to be
    collected after each task/actor finishes; otherwise they are expected to
    remain on disk.
    """
    cluster = ray_start_cluster

    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    # A missing key yields None, which does not compare equal to 0.
    soft_limit_zero = system_config.get("num_workers_soft_limit") == 0

    runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

    def check_gc_after_work_finished():
        """Assert the GC state expected once a task or actor is done."""
        if soft_limit_zero:
            # Wait for the worker to exit and the local files to be gced.
            wait_for_condition(lambda: check_local_files_gced(cluster))
        else:
            # Local files should not be gced because the soft limit is
            # large enough.
            assert not check_local_files_gced(cluster)

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    # Start a task with runtime env
    ray.get(f.options(runtime_env=runtime_env).remote())
    check_gc_after_work_finished()

    # Start an actor with runtime env
    actor = A.options(runtime_env=runtime_env).remote()
    ray.get(actor.test_import.remote())
    # Local files should not be gced while the actor is alive
    assert not check_local_files_gced(cluster)

    # Kill actor
    ray.kill(actor)
    check_gc_after_work_finished()

    # Start a task with runtime env
    ray.get(f.options(runtime_env=runtime_env).remote())
    check_gc_after_work_finished()
def test_install_failure_logging(
    start_cluster,
    specify_env_in_init,
    field,
    spec_format,
    tmp_path,
):
    """Check that a failed package install names the offending package.

    One runtime env with a nonexistent package is generated per scope
    (``init``, ``actor``, ``task``). Depending on ``specify_env_in_init`` the
    test either passes the bad env to ``ray.init`` or attaches bad envs to an
    actor and a task, and checks that the raised error mentions the package
    name for that scope.
    """
    cluster, address = start_cluster
    using_ray_client = address.startswith("ray://")

    scopes = ("init", "actor", "task")
    # A distinct package name per scope makes it possible to tell which env
    # an error message refers to.
    bad_packages: Dict[str, str] = {
        scope: "doesnotexist" + scope for scope in scopes
    }
    bad_envs: Dict[str, Dict] = {
        scope: generate_runtime_env_dict(
            field, spec_format, tmp_path, pip_list=[bad_packages[scope]]
        )
        for scope in scopes
    }

    if specify_env_in_init and using_ray_client:
        # With a "ray://" address the failure is expected while connecting.
        with pytest.raises(ConnectionAbortedError) as excinfo:
            ray.init(address, runtime_env=bad_envs["init"])
        assert bad_packages["init"] in str(excinfo.value)
        return

    if specify_env_in_init:
        # Without Ray Client, ray.init() itself is expected to succeed; the
        # failure is expected when a task is actually run.
        ray.init(address, runtime_env=bad_envs["init"])

        @ray.remote
        def g():
            pass

        with pytest.raises(RuntimeEnvSetupError, match=bad_packages["init"]):
            ray.get(g.remote())
        return

    ray.init(address)

    @ray.remote(runtime_env=bad_envs["actor"])
    class A:
        def f(self):
            pass

    a = A.remote()  # noqa
    with pytest.raises(RuntimeEnvSetupError, match=bad_packages["actor"]):
        ray.get(a.f.remote())

    @ray.remote(runtime_env=bad_envs["task"])
    def f():
        pass

    with pytest.raises(RuntimeEnvSetupError, match=bad_packages["task"]):
        ray.get(f.remote())
def test_skip_local_gc_env_var(self, skip_local_gc, start_cluster, field, tmp_path):
    """Check that local files survive job exit when ``skip_local_gc`` is used.

    NOTE(review): ``skip_local_gc`` is a fixture defined elsewhere;
    presumably it sets the environment variable that disables local GC --
    confirm against the fixture definition.
    """
    cluster, address = start_cluster

    env_spec = generate_runtime_env_dict(field, "python_object", tmp_path)
    ray.init(address, namespace="test", runtime_env=env_spec)

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    task_ok = ray.get(f.remote())
    assert task_ok

    ray.shutdown()

    # Leave enough time for a conda env to be uninstalled, in case GC were
    # (incorrectly) triggered.
    gc_grace_seconds = 10
    time.sleep(gc_grace_seconds)

    # Nothing may have been GC'ed.
    files_were_gced = check_local_files_gced(cluster)
    assert not files_were_gced