def test_detached_actor_gc(
    self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
):
    """Tests that detached actor's conda env is GC'd only when it exits."""
    cluster, address = start_cluster
    ray.init(
        address,
        namespace="test",
        runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path),
    )

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    a = A.options(name="test", lifetime="detached").remote()
    ray.get(a.test_import.remote())

    assert not check_local_files_gced(cluster)

    ray.shutdown()
    ray.init(address, namespace="test")

    assert not check_local_files_gced(cluster)

    a = ray.get_actor("test")
    assert ray.get(a.test_import.remote())

    ray.kill(a)

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)
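# generate_runtime_env_dict() is defined in the shared test utilities, not in
# this file. The sketch below is only an illustration of the shape of the
# runtime_env dict the tests above expect (a "pip" or "conda" field, optionally
# written out as a spec file under tmp_path). The exact spec contents and the
# handling of spec_format values are assumptions, not the real helper.
def _example_generate_runtime_env_dict(field, spec_format, tmp_path):
    if field == "pip":
        requirements = ["pip-install-test==0.5"]
        if spec_format == "file":
            # Write a requirements.txt and reference it by path.
            requirements_file = tmp_path / "requirements.txt"
            requirements_file.write_text("\n".join(requirements))
            return {"pip": str(requirements_file)}
        return {"pip": requirements}
    elif field == "conda":
        conda_dict = {"dependencies": ["pip", {"pip": ["pip-install-test==0.5"]}]}
        if spec_format == "file":
            import yaml

            conda_file = tmp_path / "environment.yml"
            conda_file.write_text(yaml.dump(conda_dict))
            return {"conda": str(conda_file)}
        return {"conda": conda_dict}
    raise ValueError(f"Unexpected field: {field}")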
def test_job_level_gc(start_cluster, field, spec_format, tmp_path):
    """Tests that job-level conda env is GC'd when the job exits."""
    # We must use a single-node cluster. If we simulate a multi-node cluster
    # then the conda installs will proceed simultaneously, one on each node,
    # but since they're actually running on the same machine we get errors.
    cluster, address = start_cluster
    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    # Ensure that the runtime env has been installed.
    assert ray.get(f.remote())

    assert not check_local_files_gced(cluster)

    ray.shutdown()

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)

    # Check that we can reconnect with the same env. (In other words, ensure
    # the conda env was fully deleted and not left in some kind of corrupted
    # state that prevents reinstalling the same conda env.)
    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )
    assert ray.get(f.remote())
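# check_local_files_gced() is another shared test helper. The sketch below only
# illustrates the kind of check the tests above rely on: every node's per-node
# runtime-resources directory should be empty once its URIs are GC'd. The
# directory layout and the node.get_runtime_env_dir_path() accessor are
# assumptions for illustration, not the real implementation.
def _example_check_local_files_gced(cluster):
    import os

    for node in cluster.list_all_nodes():
        local_dir = node.get_runtime_env_dir_path()  # hypothetical accessor
        for subdir in ("conda", "pip", "working_dir_files", "py_modules_files"):
            path = os.path.join(local_dir, subdir)
            # Any leftover installed env or downloaded package means GC has
            # not completed for this node.
            if os.path.exists(path) and len(os.listdir(path)) > 0:
                return False
    return True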
def test_actor_level_gc(
    self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
):
    """Tests that actor-level conda env is GC'd when the actor exits."""
    cluster, address = start_cluster
    ray.init(address)

    runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    NUM_ACTORS = 5
    actors = [A.options(runtime_env=runtime_env).remote() for _ in range(NUM_ACTORS)]
    ray.get([a.test_import.remote() for a in actors])
    for i in range(NUM_ACTORS):
        assert not check_local_files_gced(cluster)
        ray.kill(actors[i])

    wait_for_condition(lambda: check_local_files_gced(cluster))
def test_actor_level_gc(
    self, start_cluster, working_dir_and_pymodules_disable_URI_cache, option: str
):
    """Tests that actor-level working_dir is GC'd when the actor exits."""
    NUM_NODES = 5
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )

    ray.init(address)

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
    else:
        A = A.options(
            runtime_env={
                "py_modules": [
                    S3_PACKAGE_URI,
                ]
            }
        )

    num_cpus = int(ray.available_resources()["CPU"])
    actors = [A.remote() for _ in range(num_cpus)]
    ray.get([a.check.remote() for a in actors])

    for i in range(num_cpus):
        assert not check_local_files_gced(cluster)
        ray.kill(actors[i])

    wait_for_condition(lambda: check_local_files_gced(cluster))
def test_actor_level_gc(
    self,
    start_cluster,
    working_dir_and_pymodules_disable_URI_cache,
    disable_temporary_uri_pinning,
    option: str,
):
    """Tests that actor-level working_dir is GC'd when the actor exits."""
    NUM_NODES = 5
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )
        print(f'Added node with runtime_env_dir_name "node_{i}_runtime_resources".')

    print(f"Added all {NUM_NODES} nodes.")

    ray.init(address)
    print(f'Initialized Ray at address "{address}".')

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
    else:
        A = A.options(
            runtime_env={
                "py_modules": [
                    S3_PACKAGE_URI,
                ]
            }
        )

    print(f'Created actor class A with option "{option}".')

    num_cpus = int(ray.available_resources()["CPU"])
    print(f"{num_cpus} cpus available.")

    actors = [A.remote() for _ in range(num_cpus)]
    print(f"Created {len(actors)} actors.")

    ray.get([a.check.remote() for a in actors])
    print("Got responses from all actors.")

    for i in range(num_cpus):
        assert not check_local_files_gced(cluster)
        print(f"check_local_files_gced assertion passed for cpu {i}.")
        ray.kill(actors[i])
        print(f"Issued ray.kill for actor {i}.")

    wait_for_condition(lambda: check_local_files_gced(cluster))
    print("check_local_files_gced passed wait_for_condition block.")
def test_detached_actor_gc(
    self, start_cluster, runtime_env_disable_URI_cache, option: str, source: str
):
    """Tests that URIs for detached actors are GC'd only when they exit."""
    cluster, address = start_cluster

    if option == "working_dir":
        ray.init(address, namespace="test", runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(address, namespace="test", runtime_env={"py_modules": [source]})

    # For a local directory, the package should be in the GCS.
    # For an S3 URI, there should be nothing in the GCS because
    # it will be downloaded from S3 directly on each node.
    if source == S3_PACKAGE_URI:
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    @ray.remote
    class A:
        def test_import(self):
            import test_module

            test_module.one()

    a = A.options(name="test", lifetime="detached").remote()
    ray.get(a.test_import.remote())

    if source == S3_PACKAGE_URI:
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()
    assert not check_local_files_gced(cluster)

    ray.shutdown()
    ray.init(address, namespace="test")

    if source == S3_PACKAGE_URI:
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()
    assert not check_local_files_gced(cluster)

    a = ray.get_actor("test")
    ray.get(a.test_import.remote())

    ray.kill(a)
    wait_for_condition(check_internal_kv_gced)
    wait_for_condition(lambda: check_local_files_gced(cluster))
def test_skip_local_gc_env_var(
    self,
    skip_local_gc,
    start_cluster,
    working_dir_and_pymodules_disable_URI_cache,
    disable_temporary_uri_pinning,
    source,
):
    cluster, address = start_cluster
    ray.init(address, namespace="test", runtime_env={"working_dir": source})

    @ray.remote
    class A:
        def test_import(self):
            import test_module

            test_module.one()

    a = A.remote()
    ray.get(a.test_import.remote())  # Check working_dir was downloaded.

    ray.shutdown()

    time.sleep(1)  # Give time for GC to potentially happen.
    assert not check_local_files_gced(cluster)
def test_default_large_cache(start_cluster, option: str, source: str):
    """Check small files aren't GC'ed when using the default large cache."""
    NUM_NODES = 3
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )

    if option == "working_dir":
        ray.init(address, runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(address, runtime_env={"py_modules": [source]})

    @ray.remote
    def f():
        pass

    # Wait for the runtime env to be set up. This can be accomplished by
    # getting the result of a task.
    ray.get(f.remote())

    ray.shutdown()

    # If we immediately check that the files weren't GC'ed, it may spuriously
    # pass, so sleep first to give time for any deletions to happen.
    time.sleep(5)
    assert not check_local_files_gced(cluster)

    ray.init(address)

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
    else:
        A = A.options(runtime_env={"py_modules": [S3_PACKAGE_URI]})

    _ = A.remote()
    ray.shutdown()

    time.sleep(5)
    assert not check_local_files_gced(cluster)
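# Several tests above depend on the working_dir_and_pymodules_disable_URI_cache
# fixture, which is defined in the shared conftest. The sketch below is a
# hedged illustration of how such a fixture could work; it assumes the cache
# sizes are controlled by RAY_RUNTIME_ENV_*_CACHE_SIZE_GB environment variables
# (the exact variable names are assumptions) and that setting them to 0 makes
# unreferenced URIs eligible for immediate deletion.
import pytest


@pytest.fixture
def _example_working_dir_and_pymodules_disable_URI_cache(monkeypatch):
    # Must take effect before the cluster is started for the env vars to be
    # picked up by the nodes (ordering relative to start_cluster is assumed).
    monkeypatch.setenv("RAY_RUNTIME_ENV_WORKING_DIR_CACHE_SIZE_GB", "0")
    monkeypatch.setenv("RAY_RUNTIME_ENV_PY_MODULES_CACHE_SIZE_GB", "0")
    yield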
def test_job_level_gc(
    self, start_cluster, runtime_env_disable_URI_cache, option: str, source: str
):
    """Tests that job-level working_dir is GC'd when the job exits."""
    NUM_NODES = 3
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )

    if option == "working_dir":
        ray.init(address, runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(address, runtime_env={"py_modules": [source]})

    # For a local directory, the package should be in the GCS.
    # For an S3 URI, there should be nothing in the GCS because
    # it will be downloaded from S3 directly on each node.
    if source == S3_PACKAGE_URI:
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    @ray.remote(num_cpus=1)
    class A:
        def test_import(self):
            import test_module

            test_module.one()

    num_cpus = int(ray.available_resources()["CPU"])
    actors = [A.remote() for _ in range(num_cpus)]
    ray.get([a.test_import.remote() for a in actors])

    if source == S3_PACKAGE_URI:
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()
    assert not check_local_files_gced(cluster)

    ray.shutdown()

    # Need to re-connect to use internal_kv.
    ray.init(address=address)
    wait_for_condition(check_internal_kv_gced)
    wait_for_condition(lambda: check_local_files_gced(cluster))
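# check_internal_kv_gced() also comes from the shared test utilities. The
# sketch below illustrates the assumed behavior: after GC there should be no
# package entries (keys with the "gcs://" prefix) left in the GCS internal KV
# store. The use of the Ray-internal _internal_kv_list() API here is an
# assumption about how the real helper is implemented.
def _example_check_internal_kv_gced():
    from ray.experimental.internal_kv import _internal_kv_list

    # No remaining "gcs://..." package keys means the uploaded packages have
    # been garbage collected from the GCS.
    return len(_internal_kv_list("gcs://")) == 0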
def test_task_level_gc(ray_start_cluster, field, spec_format, tmp_path):
    """Tests that task-level conda env is GC'd when the worker exits."""
    cluster = ray_start_cluster

    soft_limit_zero = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if (
        "num_workers_soft_limit" in system_config
        and system_config["num_workers_soft_limit"] == 0
    ):
        soft_limit_zero = True

    runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    # Start a task with the runtime env.
    ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)

    # Start an actor with the runtime env.
    actor = A.options(runtime_env=runtime_env).remote()
    ray.get(actor.test_import.remote())
    # Local files should not be GC'd while the actor is alive.
    assert not check_local_files_gced(cluster)

    # Kill the actor.
    ray.kill(actor)
    if soft_limit_zero:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)

    # Start another task with the runtime env.
    ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)
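# wait_for_condition() is imported from Ray's test utilities. The simplified
# sketch below shows the polling behavior the GC tests above rely on:
# repeatedly evaluate the predicate until it returns True or the timeout
# elapses. The exact parameter names and error type are assumptions.
def _example_wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    import time

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")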
def test_skip_local_gc_env_var(self, skip_local_gc, start_cluster, field, tmp_path):
    cluster, address = start_cluster
    runtime_env = generate_runtime_env_dict(field, "python_object", tmp_path)
    ray.init(address, namespace="test", runtime_env=runtime_env)

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    assert ray.get(f.remote())

    ray.shutdown()

    # Give enough time for the conda env to potentially be uninstalled.
    time.sleep(10)

    # Check that nothing was GC'd.
    assert not check_local_files_gced(cluster)
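# The skip_local_gc fixture used by the two test_skip_local_gc_env_var tests
# lives in the shared conftest. The sketch below is a hedged illustration: it
# assumes the behavior is toggled by a RAY_RUNTIME_ENV_SKIP_LOCAL_GC
# environment variable (the variable name is an assumption) set before the
# cluster starts, so local runtime env files are never deleted on shutdown.
import pytest


@pytest.fixture
def _example_skip_local_gc(monkeypatch):
    monkeypatch.setenv("RAY_RUNTIME_ENV_SKIP_LOCAL_GC", "1")
    yield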
def test_job_level_gc(
    self,
    start_cluster,
    working_dir_and_pymodules_disable_URI_cache,
    disable_temporary_uri_pinning,
    option: str,
    source: str,
):
    """Tests that job-level working_dir is GC'd when the job exits."""
    NUM_NODES = 3
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )
        print(f'Added node with runtime_env_dir_name "node_{i}_runtime_resources".')

    print(f"Added all {NUM_NODES} nodes.")

    if option == "working_dir":
        ray.init(address, runtime_env={"working_dir": source})
        print("Initialized ray with working_dir runtime_env.")
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(
            address,
            runtime_env={
                "py_modules": [
                    source,
                    Path(os.path.dirname(__file__))
                    / "pip_install_test-0.5-py3-none-any.whl",
                ]
            },
        )
        print("Initialized ray with py_modules runtime_env.")

    # For a local directory, the package should be in the GCS.
    # For an S3 URI, there should be nothing in the GCS because
    # it will be downloaded from S3 directly on each node.
    # In the "py_modules" case, we have specified a local wheel
    # file to be uploaded to the GCS, so we do not expect the
    # internal KV to be empty.
    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    print(f'kv check 1 passed with source "{source}" and option "{option}".')

    @ray.remote(num_cpus=1)
    class A:
        def test_import(self):
            import test_module

            if option == "py_modules":
                import pip_install_test  # noqa: F401
            test_module.one()

    num_cpus = int(ray.available_resources()["CPU"])
    print(f"{num_cpus} cpus available.")

    actors = [A.remote() for _ in range(num_cpus)]
    print(f"Created {len(actors)} actors.")

    ray.get([a.test_import.remote() for a in actors])
    print("Got responses from all actors.")

    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    print(f'kv check 2 passed with source "{source}" and option "{option}".')

    assert not check_local_files_gced(cluster)
    print("check_local_files_gced() check passed.")

    ray.shutdown()
    print("Ray has been shut down.")

    # Need to re-connect to use internal_kv.
    ray.init(address=address)
    print(f'Reconnected to Ray at address "{address}".')

    wait_for_condition(check_internal_kv_gced)
    print("check_internal_kv_gced passed wait_for_condition block.")

    wait_for_condition(lambda: check_local_files_gced(cluster))
    print("check_local_files_gced passed wait_for_condition block.")
def test_detached_actor_gc(
    self,
    start_cluster,
    working_dir_and_pymodules_disable_URI_cache,
    disable_temporary_uri_pinning,
    option: str,
    source: str,
):
    """Tests that URIs for detached actors are GC'd only when they exit."""
    cluster, address = start_cluster

    if option == "working_dir":
        ray.init(address, namespace="test", runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(
            address,
            namespace="test",
            runtime_env={
                "py_modules": [
                    source,
                    Path(os.path.dirname(__file__))
                    / "pip_install_test-0.5-py3-none-any.whl",
                ]
            },
        )
    print(f'Initialized Ray with option "{option}".')

    # For a local directory, the package should be in the GCS.
    # For an S3 URI, there should be nothing in the GCS because
    # it will be downloaded from S3 directly on each node.
    # In the "py_modules" case, a local wheel file will be in the GCS.
    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    print(f'kv check 1 passed with source "{source}" and option "{option}".')

    @ray.remote
    class A:
        def test_import(self):
            import test_module

            if option == "py_modules":
                import pip_install_test  # noqa: F401
            test_module.one()

    a = A.options(name="test", lifetime="detached").remote()
    print('Created detached actor with name "test".')

    ray.get(a.test_import.remote())
    print('Got response from "test" actor.')

    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    print(f'kv check 2 passed with source "{source}" and option "{option}".')

    assert not check_local_files_gced(cluster)
    print("check_local_files_gced() check passed.")

    ray.shutdown()
    print("Ray has been shut down.")

    ray.init(address, namespace="test")
    print(f'Reconnected to Ray at address "{address}" and namespace "test".')

    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    print(f'kv check 3 passed with source "{source}" and option "{option}".')

    assert not check_local_files_gced(cluster)
    print("check_local_files_gced() check passed.")

    a = ray.get_actor("test")
    print('Got "test" actor.')

    ray.get(a.test_import.remote())
    print('Got response from "test" actor.')

    ray.kill(a)
    print('Issued ray.kill() request to "test" actor.')

    wait_for_condition(check_internal_kv_gced)
    print("check_internal_kv_gced passed wait_for_condition block.")

    wait_for_condition(lambda: check_local_files_gced(cluster))
    print("check_local_files_gced passed wait_for_condition block.")
def test_task_level_gc(runtime_env_disable_URI_cache, ray_start_cluster, option):
    """Tests that task-level working_dir is GC'd when the worker exits."""
    cluster = ray_start_cluster

    soft_limit_zero = False
    worker_register_timeout = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if (
        "num_workers_soft_limit" in system_config
        and system_config["num_workers_soft_limit"] == 0
    ):
        soft_limit_zero = True
    if (
        "worker_register_timeout_seconds" in system_config
        and system_config["worker_register_timeout_seconds"] != 0
    ):
        worker_register_timeout = True

    @ray.remote
    def f():
        import test_module

        test_module.one()

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        runtime_env = {"working_dir": S3_PACKAGE_URI}
    else:
        runtime_env = {"py_modules": [S3_PACKAGE_URI]}

    # Note: Use a larger timeout if the S3 package downloads slowly.
    get_timeout = 10

    # Start a task with the runtime env.
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(), timeout=get_timeout
            )
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)

    # Start an actor with the runtime env.
    actor = A.options(runtime_env=runtime_env).remote()
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(actor.check.remote(), timeout=get_timeout)
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        ray.get(actor.check.remote())
        assert not check_local_files_gced(cluster)

    # Kill the actor.
    ray.kill(actor)
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)

    # Start another task with the runtime env.
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(
                f.options(runtime_env=runtime_env).remote(), timeout=get_timeout
            )
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and the local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough
        # to keep the idle worker alive.
        assert not check_local_files_gced(cluster)
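# The soft_limit_zero / worker_register_timeout branches in the task-level GC
# tests depend on how the ray_start_cluster fixture was parametrized. The
# parametrize marker below is a hedged sketch of the kind of indirect
# parametrization that exercises all three branches; the exact fixture plumbing
# in the real conftest is an assumption.
import pytest

_EXAMPLE_TASK_LEVEL_GC_PARAMS = pytest.mark.parametrize(
    "ray_start_cluster",
    [
        # Workers exit right after the task, so local files get GC'd.
        {"num_cpus": 4, "_system_config": {"num_workers_soft_limit": 0}},
        # A generous soft limit keeps idle workers (and their URIs) alive.
        {"num_cpus": 4, "_system_config": {"num_workers_soft_limit": 5}},
        # Force worker registration to time out to hit the GetTimeoutError path.
        {
            "num_cpus": 4,
            "_system_config": {
                "num_workers_soft_limit": 0,
                "worker_register_timeout_seconds": 1,
            },
        },
    ],
    indirect=True,
)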
def test_job_level_gc(
    self,
    start_cluster,
    working_dir_and_pymodules_disable_URI_cache,
    option: str,
    source: str,
):
    """Tests that job-level working_dir is GC'd when the job exits."""
    NUM_NODES = 3
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(
            num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
        )

    if option == "working_dir":
        ray.init(address, runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(
            address,
            runtime_env={
                "py_modules": [
                    source,
                    Path(os.path.dirname(__file__))
                    / "pip_install_test-0.5-py3-none-any.whl",
                ]
            },
        )

    # For a local directory, the package should be in the GCS.
    # For an S3 URI, there should be nothing in the GCS because
    # it will be downloaded from S3 directly on each node.
    # In the "py_modules" case, we have specified a local wheel
    # file to be uploaded to the GCS, so we do not expect the
    # internal KV to be empty.
    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()

    @ray.remote(num_cpus=1)
    class A:
        def test_import(self):
            import test_module

            if option == "py_modules":
                import pip_install_test  # noqa: F401
            test_module.one()

    num_cpus = int(ray.available_resources()["CPU"])
    actors = [A.remote() for _ in range(num_cpus)]
    ray.get([a.test_import.remote() for a in actors])

    if source == S3_PACKAGE_URI and option != "py_modules":
        assert check_internal_kv_gced()
    else:
        assert not check_internal_kv_gced()
    assert not check_local_files_gced(cluster)

    ray.shutdown()

    # Need to re-connect to use internal_kv.
    ray.init(address=address)
    wait_for_condition(check_internal_kv_gced)
    wait_for_condition(lambda: check_local_files_gced(cluster))