    def test_detached_actor_gc(
        self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
    ):
        """Tests that detached actor's conda env is GC'd only when it exits."""
        cluster, address = start_cluster

        ray.init(
            address,
            namespace="test",
            runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path),
        )

        @ray.remote
        class A:
            def test_import(self):
                import pip_install_test  # noqa: F401

                return True

        a = A.options(name="test", lifetime="detached").remote()
        ray.get(a.test_import.remote())

        assert not check_local_files_gced(cluster)

        ray.shutdown()
        ray.init(address, namespace="test")

        assert not check_local_files_gced(cluster)

        a = ray.get_actor("test")
        assert ray.get(a.test_import.remote())

        ray.kill(a)

        wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)
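These examples all call a shared helper, generate_runtime_env_dict(field, spec_format, tmp_path), whose definition is not shown. A minimal sketch of what such a helper might look like, assuming field is "pip" or "conda" and spec_format chooses between an in-memory Python object and a spec file written to tmp_path (the package pip-install-test matches the import used in the tests; everything else here is illustrative):

from pathlib import Path

import yaml  # assumed available; only needed for the conda spec-file branch


def generate_runtime_env_dict(field, spec_format, tmp_path):
    """Build a runtime_env dict that installs pip-install-test via pip or conda."""
    pip_packages = ["pip-install-test==0.5"]
    if field == "pip":
        if spec_format == "python_object":
            return {"pip": pip_packages}
        # "file": write a requirements.txt and reference it by path.
        requirements_file = Path(tmp_path) / "requirements.txt"
        requirements_file.write_text("\n".join(pip_packages))
        return {"pip": str(requirements_file)}
    elif field == "conda":
        conda_dict = {"dependencies": ["pip", {"pip": pip_packages}]}
        if spec_format == "python_object":
            return {"conda": conda_dict}
        # "file": write an environment.yml and reference it by path.
        conda_file = Path(tmp_path) / "environment.yml"
        conda_file.write_text(yaml.safe_dump(conda_dict))
        return {"conda": str(conda_file)}
    raise ValueError(f"Unexpected field: {field}")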
def test_job_level_gc(start_cluster, field, spec_format, tmp_path):
    """Tests that job-level conda env is GC'd when the job exits."""
    # We must use a single-node cluster.  If we simulate a multi-node cluster
    # then the conda installs will proceed simultaneously, one on each node,
    # but since they're actually running on the same machine we get errors.
    cluster, address = start_cluster

    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    # Ensure that the runtime env has been installed.
    assert ray.get(f.remote())

    assert not check_local_files_gced(cluster)

    ray.shutdown()

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)

    # Check that we can reconnect with the same env.  (In other words, ensure
    # the conda env was fully deleted and not left in some kind of corrupted
    # state that prevents reinstalling the same conda env.)

    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )

    assert ray.get(f.remote())
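check_local_files_gced(cluster) is the other helper every example leans on: it asks whether the locally installed runtime-env artifacts (conda/pip environments, extracted working_dir and py_modules packages) are gone from every node. A hedged sketch, assuming each node object exposes its runtime-resources directory through a hypothetical get_runtime_env_dir_path() accessor and that artifacts live in per-type subdirectories:

from pathlib import Path


def check_local_files_gced(cluster):
    """Return True iff no node still holds locally installed runtime-env files."""
    for node in cluster.list_all_nodes():
        # get_runtime_env_dir_path() is assumed; the real accessor may differ.
        runtime_env_dir = Path(node.get_runtime_env_dir_path())
        for subdir in ("conda", "pip", "working_dir_files", "py_modules_files"):
            type_dir = runtime_env_dir / subdir
            if type_dir.exists() and any(type_dir.iterdir()):
                return False
    return True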
    def test_actor_level_gc(self, runtime_env_disable_URI_cache, start_cluster,
                            field, spec_format, tmp_path):
        """Tests that actor-level working_dir is GC'd when the actor exits."""
        cluster, address = start_cluster

        ray.init(address)

        runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

        @ray.remote
        class A:
            def test_import(self):
                import pip_install_test  # noqa: F401

                return True

        NUM_ACTORS = 5
        actors = [
            A.options(runtime_env=runtime_env).remote()
            for _ in range(NUM_ACTORS)
        ]
        ray.get([a.test_import.remote() for a in actors])
        for i in range(NUM_ACTORS):
            assert not check_local_files_gced(cluster)
            ray.kill(actors[i])
        wait_for_condition(lambda: check_local_files_gced(cluster))
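wait_for_condition polls a predicate until it returns True or a timeout expires; Ray ships one in its test utilities, but a minimal stand-in looks like this (the default timeout and retry interval are illustrative):

import time


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    """Poll condition_predictor until it returns True or `timeout` seconds pass."""
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError(f"Condition not met within {timeout} seconds.")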
    def test_actor_level_gc(self, start_cluster,
                            working_dir_and_pymodules_disable_URI_cache,
                            option: str):
        """Tests that actor-level working_dir is GC'd when the actor exits."""
        NUM_NODES = 5
        cluster, address = start_cluster
        for i in range(NUM_NODES - 1):  # Head node already added.
            cluster.add_node(
                num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")

        ray.init(address)

        @ray.remote(num_cpus=1)
        class A:
            def check(self):
                import test_module

                test_module.one()

        if option == "working_dir":
            A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
        else:
            A = A.options(runtime_env={"py_modules": [
                S3_PACKAGE_URI,
            ]})

        num_cpus = int(ray.available_resources()["CPU"])
        actors = [A.remote() for _ in range(num_cpus)]
        ray.get([a.check.remote() for a in actors])
        for i in range(num_cpus):
            assert not check_local_files_gced(cluster)
            ray.kill(actors[i])
        wait_for_condition(lambda: check_local_files_gced(cluster))
    def test_actor_level_gc(
        self,
        start_cluster,
        working_dir_and_pymodules_disable_URI_cache,
        disable_temporary_uri_pinning,
        option: str,
    ):
        """Tests that actor-level working_dir is GC'd when the actor exits."""
        NUM_NODES = 5
        cluster, address = start_cluster
        for i in range(NUM_NODES - 1):  # Head node already added.
            cluster.add_node(
                num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources"
            )
            print(f'Added node with runtime_env_dir_name "node_{i}_runtime_resources".')

        print(f"Added all {NUM_NODES} nodes.")

        ray.init(address)
        print(f'Initialized Ray at address "{address}".')

        @ray.remote(num_cpus=1)
        class A:
            def check(self):
                import test_module

                test_module.one()

        if option == "working_dir":
            A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
        else:
            A = A.options(
                runtime_env={
                    "py_modules": [
                        S3_PACKAGE_URI,
                    ]
                }
            )
        print(f'Created actor class A with option "{option}".')

        num_cpus = int(ray.available_resources()["CPU"])
        print(f"{num_cpus} cpus available.")

        actors = [A.remote() for _ in range(num_cpus)]
        print(f"Created {len(actors)} actors.")

        ray.get([a.check.remote() for a in actors])
        print("Got responses from all actors.")

        for i in range(num_cpus):
            assert not check_local_files_gced(cluster)
            print(f"check_local_files_gced assertion passed for cpu {i}.")

            ray.kill(actors[i])
            print(f"Issued ray.kill for actor {i}.")

        wait_for_condition(lambda: check_local_files_gced(cluster))
        print("check_local_files_gced passed wait_for_condition block.")
    def test_detached_actor_gc(self, start_cluster,
                               runtime_env_disable_URI_cache, option: str,
                               source: str):
        """Tests that URIs for detached actors are GC'd only when they exit."""
        cluster, address = start_cluster

        if option == "working_dir":
            ray.init(address,
                     namespace="test",
                     runtime_env={"working_dir": source})
        elif option == "py_modules":
            if source != S3_PACKAGE_URI:
                source = str(Path(source) / "test_module")
            ray.init(address,
                     namespace="test",
                     runtime_env={"py_modules": [source]})

        # For a local directory, the package should be in the GCS.
        # For an S3 URI, there should be nothing in the GCS because
        # it will be downloaded from S3 directly on each node.
        if source == S3_PACKAGE_URI:
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()

        @ray.remote
        class A:
            def test_import(self):
                import test_module

                test_module.one()

        a = A.options(name="test", lifetime="detached").remote()
        ray.get(a.test_import.remote())

        if source == S3_PACKAGE_URI:
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        assert not check_local_files_gced(cluster)

        ray.shutdown()

        ray.init(address, namespace="test")

        if source == S3_PACKAGE_URI:
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        assert not check_local_files_gced(cluster)

        a = ray.get_actor("test")
        ray.get(a.test_import.remote())

        ray.kill(a)
        wait_for_condition(check_internal_kv_gced)
        wait_for_condition(lambda: check_local_files_gced(cluster))
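check_internal_kv_gced() complements the local-files check: it verifies that no uploaded package blobs remain in the GCS internal key-value store. A hedged sketch, assuming packages are keyed under a "gcs://" prefix and that Ray's private internal-KV test API is available on the connected cluster:

from ray.experimental.internal_kv import _internal_kv_list  # private, test-only API


def check_internal_kv_gced():
    """Return True iff no runtime-env packages remain in the GCS internal KV."""
    # Packages uploaded for working_dir / py_modules are stored under "gcs://" keys.
    return len(_internal_kv_list("gcs://")) == 0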
    def test_skip_local_gc_env_var(
        self,
        skip_local_gc,
        start_cluster,
        working_dir_and_pymodules_disable_URI_cache,
        disable_temporary_uri_pinning,
        source,
    ):
        cluster, address = start_cluster
        ray.init(address,
                 namespace="test",
                 runtime_env={"working_dir": source})

        @ray.remote
        class A:
            def test_import(self):
                import test_module

                test_module.one()

        a = A.remote()
        ray.get(a.test_import.remote())  # Check working_dir was downloaded

        ray.shutdown()

        time.sleep(1)  # Give time for GC to potentially happen
        assert not check_local_files_gced(cluster)
def test_default_large_cache(start_cluster, option: str, source: str):
    """Check small files aren't GC'ed when using the default large cache."""
    NUM_NODES = 3
    cluster, address = start_cluster
    for i in range(NUM_NODES - 1):  # Head node already added.
        cluster.add_node(num_cpus=1,
                         runtime_env_dir_name=f"node_{i}_runtime_resources")

    if option == "working_dir":
        ray.init(address, runtime_env={"working_dir": source})
    elif option == "py_modules":
        if source != S3_PACKAGE_URI:
            source = str(Path(source) / "test_module")
        ray.init(address, runtime_env={"py_modules": [source]})

    @ray.remote
    def f():
        pass

    # Wait for runtime env to be set up. This can be accomplished by getting
    # the result of a task.
    ray.get(f.remote())
    ray.shutdown()

    # If we immediately check that the files weren't GCed, it may spuriously
    # pass, so sleep first to give time for any deletions to happen.
    time.sleep(5)
    assert not check_local_files_gced(cluster)

    ray.init(address)

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        A = A.options(runtime_env={"working_dir": S3_PACKAGE_URI})
    else:
        A = A.options(runtime_env={"py_modules": [S3_PACKAGE_URI]})

    _ = A.remote()
    ray.shutdown()
    time.sleep(5)
    assert not check_local_files_gced(cluster)
    def test_job_level_gc(self, start_cluster, runtime_env_disable_URI_cache,
                          option: str, source: str):
        """Tests that job-level working_dir is GC'd when the job exits."""
        NUM_NODES = 3
        cluster, address = start_cluster
        for i in range(NUM_NODES - 1):  # Head node already added.
            cluster.add_node(
                num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")

        if option == "working_dir":
            ray.init(address, runtime_env={"working_dir": source})
        elif option == "py_modules":
            if source != S3_PACKAGE_URI:
                source = str(Path(source) / "test_module")
            ray.init(address, runtime_env={"py_modules": [source]})

        # For a local directory, the package should be in the GCS.
        # For an S3 URI, there should be nothing in the GCS because
        # it will be downloaded from S3 directly on each node.
        if source == S3_PACKAGE_URI:
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()

        @ray.remote(num_cpus=1)
        class A:
            def test_import(self):
                import test_module

                test_module.one()

        num_cpus = int(ray.available_resources()["CPU"])
        actors = [A.remote() for _ in range(num_cpus)]
        ray.get([a.test_import.remote() for a in actors])

        if source == S3_PACKAGE_URI:
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        assert not check_local_files_gced(cluster)

        ray.shutdown()

        # Need to re-connect to use internal_kv.
        ray.init(address=address)
        wait_for_condition(check_internal_kv_gced)
        wait_for_condition(lambda: check_local_files_gced(cluster))
def test_task_level_gc(ray_start_cluster, field, spec_format, tmp_path):
    """Tests that task-level working_dir is GC'd when the task exits."""

    cluster = ray_start_cluster

    soft_limit_zero = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if ("num_workers_soft_limit" in system_config
            and system_config["num_workers_soft_limit"] == 0):
        soft_limit_zero = True

    runtime_env = generate_runtime_env_dict(field, spec_format, tmp_path)

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    @ray.remote
    class A:
        def test_import(self):
            import pip_install_test  # noqa: F401

            return True

    # Start a task with runtime env
    ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)

    # Start an actor with runtime env
    actor = A.options(runtime_env=runtime_env).remote()
    ray.get(actor.test_import.remote())
    # Local files should not be gced
    assert not check_local_files_gced(cluster)

    # Kill actor
    ray.kill(actor)
    if soft_limit_zero:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)

    # Start a task with runtime env
    ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)
    def test_skip_local_gc_env_var(self, skip_local_gc, start_cluster, field,
                                   tmp_path):
        cluster, address = start_cluster
        runtime_env = generate_runtime_env_dict(field, "python_object",
                                                tmp_path)
        ray.init(address, namespace="test", runtime_env=runtime_env)

        @ray.remote
        def f():
            import pip_install_test  # noqa: F401
            return True

        assert ray.get(f.remote())

        ray.shutdown()

        # Give enough time for the conda env to potentially be uninstalled.
        time.sleep(10)

        # Check nothing was GC'ed
        assert not check_local_files_gced(cluster)
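The skip_local_gc fixture used in these two examples turns off local runtime-env garbage collection for the test, typically by setting an environment variable before the Ray cluster under test starts. A hedged sketch (the variable name RAY_RUNTIME_ENV_SKIP_LOCAL_GC is an assumption about the build being tested):

import os
from unittest import mock

import pytest


@pytest.fixture
def skip_local_gc():
    # Assumed env var; it must be in place before the cluster fixture starts Ray.
    with mock.patch.dict(os.environ, {"RAY_RUNTIME_ENV_SKIP_LOCAL_GC": "1"}):
        yield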
    def test_job_level_gc(
        self,
        start_cluster,
        working_dir_and_pymodules_disable_URI_cache,
        disable_temporary_uri_pinning,
        option: str,
        source: str,
    ):
        """Tests that job-level working_dir is GC'd when the job exits."""
        NUM_NODES = 3
        cluster, address = start_cluster
        for i in range(NUM_NODES - 1):  # Head node already added.
            cluster.add_node(
                num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")
            print(
                f'Added node with runtime_env_dir_name "node_{i}_runtime_resources".'
            )

        print(f"Added all {NUM_NODES} nodes.")

        if option == "working_dir":
            ray.init(address, runtime_env={"working_dir": source})
            print("Initialized ray with working_dir runtime_env.")
        elif option == "py_modules":
            if source != S3_PACKAGE_URI:
                source = str(Path(source) / "test_module")
            ray.init(
                address,
                runtime_env={
                    "py_modules": [
                        source,
                        Path(os.path.dirname(__file__)) /
                        "pip_install_test-0.5-py3-none-any.whl",
                    ]
                },
            )
            print("Initialized ray with py_modules runtime_env.")

        # For a local directory, the package should be in the GCS.
        # For an S3 URI, there should be nothing in the GCS because
        # it will be downloaded from S3 directly on each node.
        # In the "py_modules" case, we have specified a local wheel
        # file to be uploaded to the GCS, so we do not expect the
        # internal KV to be empty.
        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()

        print(
            f'kv check 1 passed with source "{source}" and option "{option}".')

        @ray.remote(num_cpus=1)
        class A:
            def test_import(self):
                import test_module

                if option == "py_modules":
                    import pip_install_test  # noqa: F401
                test_module.one()

        num_cpus = int(ray.available_resources()["CPU"])
        print(f"{num_cpus} cpus available.")

        actors = [A.remote() for _ in range(num_cpus)]
        print(f"Created {len(actors)} actors.")

        ray.get([a.test_import.remote() for a in actors])
        print("Got responses from all actors.")

        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        print(
            f'kv check 2 passed with source "{source}" and option "{option}".')

        assert not check_local_files_gced(cluster)
        print("check_local_files_gced() check passed.")

        ray.shutdown()
        print("Ray has been shut down.")

        # Need to re-connect to use internal_kv.
        ray.init(address=address)
        print(f'Reconnected to Ray at address "{address}".')

        wait_for_condition(check_internal_kv_gced)
        print("check_internal_kv_gced passed wait_for_condition block.")

        wait_for_condition(lambda: check_local_files_gced(cluster))
        print("check_local_files_gced passed wait_for_condition block.")
    def test_detached_actor_gc(
        self,
        start_cluster,
        working_dir_and_pymodules_disable_URI_cache,
        disable_temporary_uri_pinning,
        option: str,
        source: str,
    ):
        """Tests that URIs for detached actors are GC'd only when they exit."""
        cluster, address = start_cluster

        if option == "working_dir":
            ray.init(address,
                     namespace="test",
                     runtime_env={"working_dir": source})
        elif option == "py_modules":
            if source != S3_PACKAGE_URI:
                source = str(Path(source) / "test_module")
            ray.init(
                address,
                namespace="test",
                runtime_env={
                    "py_modules": [
                        source,
                        Path(os.path.dirname(__file__)) /
                        "pip_install_test-0.5-py3-none-any.whl",
                    ]
                },
            )
        print(f'Initialized Ray with option "{option}".')

        # For a local directory, the package should be in the GCS.
        # For an S3 URI, there should be nothing in the GCS because
        # it will be downloaded from S3 directly on each node.
        # In the "py_modules" case, a local wheel file will be in the GCS.
        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        print(
            f'kv check 1 passed with source "{source}" and option "{option}".')

        @ray.remote
        class A:
            def test_import(self):
                import test_module

                if option == "py_modules":
                    import pip_install_test  # noqa: F401
                test_module.one()

        a = A.options(name="test", lifetime="detached").remote()
        print('Created detached actor with name "test".')

        ray.get(a.test_import.remote())
        print('Got response from "test" actor.')

        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        print(
            f'kv check 2 passed with source "{source}" and option "{option}".')

        assert not check_local_files_gced(cluster)
        print("check_local_files_gced() check passed.")

        ray.shutdown()
        print("Ray has been shut down.")

        ray.init(address, namespace="test")
        print(
            f'Reconnected to Ray at address "{address}" and namespace "test".')

        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        print(
            f'kv check 3 passed with source "{source}" and option "{option}".')

        assert not check_local_files_gced(cluster)
        print("check_local_files_gced() check passed.")

        a = ray.get_actor("test")
        print('Got "test" actor.')

        ray.get(a.test_import.remote())
        print('Got response from "test" actor.')

        ray.kill(a)
        print('Issued ray.kill() request to "test" actor.')

        wait_for_condition(check_internal_kv_gced)
        print("check_internal_kv_gced passed wait_for_condition block.")

        wait_for_condition(lambda: check_local_files_gced(cluster))
        print("check_local_files_gced passed wait_for_condition block.")
def test_task_level_gc(runtime_env_disable_URI_cache, ray_start_cluster,
                       option):
    """Tests that task-level working_dir is GC'd when the worker exits."""

    cluster = ray_start_cluster

    soft_limit_zero = False
    worker_register_timeout = False
    system_config = cluster.list_all_nodes()[0]._ray_params._system_config
    if ("num_workers_soft_limit" in system_config
            and system_config["num_workers_soft_limit"] == 0):
        soft_limit_zero = True
    if ("worker_register_timeout_seconds" in system_config
            and system_config["worker_register_timeout_seconds"] != 0):
        worker_register_timeout = True

    @ray.remote
    def f():
        import test_module

        test_module.one()

    @ray.remote(num_cpus=1)
    class A:
        def check(self):
            import test_module

            test_module.one()

    if option == "working_dir":
        runtime_env = {"working_dir": S3_PACKAGE_URI}
    else:
        runtime_env = {"py_modules": [S3_PACKAGE_URI]}

    # Note: Use a larger timeout if the S3 package downloads slowly.
    get_timeout = 10

    # Start a task with runtime env
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(f.options(runtime_env=runtime_env).remote(),
                    timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)

    # Start an actor with runtime env
    actor = A.options(runtime_env=runtime_env).remote()
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(actor.check.remote(), timeout=get_timeout)
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        ray.get(actor.check.remote())
        assert not check_local_files_gced(cluster)

    # Kill actor
    ray.kill(actor)
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)

    # Start a task with runtime env
    if worker_register_timeout:
        with pytest.raises(GetTimeoutError):
            ray.get(f.options(runtime_env=runtime_env).remote(),
                    timeout=get_timeout)
    else:
        ray.get(f.options(runtime_env=runtime_env).remote())
    if soft_limit_zero or worker_register_timeout:
        # Wait for the worker to exit and local files to be GC'd.
        wait_for_condition(lambda: check_local_files_gced(cluster))
    else:
        # Local files should not be GC'd because the soft limit is high enough.
        assert not check_local_files_gced(cluster)
    def test_job_level_gc(
        self,
        start_cluster,
        working_dir_and_pymodules_disable_URI_cache,
        option: str,
        source: str,
    ):
        """Tests that job-level working_dir is GC'd when the job exits."""
        NUM_NODES = 3
        cluster, address = start_cluster
        for i in range(NUM_NODES - 1):  # Head node already added.
            cluster.add_node(
                num_cpus=1, runtime_env_dir_name=f"node_{i}_runtime_resources")

        if option == "working_dir":
            ray.init(address, runtime_env={"working_dir": source})
        elif option == "py_modules":
            if source != S3_PACKAGE_URI:
                source = str(Path(source) / "test_module")
            ray.init(
                address,
                runtime_env={
                    "py_modules": [
                        source,
                        Path(os.path.dirname(__file__)) /
                        "pip_install_test-0.5-py3-none-any.whl",
                    ]
                },
            )

        # For a local directory, the package should be in the GCS.
        # For an S3 URI, there should be nothing in the GCS because
        # it will be downloaded from S3 directly on each node.
        # In the "py_modules" case, we have specified a local wheel
        # file to be uploaded to the GCS, so we do not expect the
        # internal KV to be empty.
        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()

        @ray.remote(num_cpus=1)
        class A:
            def test_import(self):
                import test_module

                if option == "py_modules":
                    import pip_install_test  # noqa: F401
                test_module.one()

        num_cpus = int(ray.available_resources()["CPU"])
        actors = [A.remote() for _ in range(num_cpus)]
        ray.get([a.test_import.remote() for a in actors])

        if source == S3_PACKAGE_URI and option != "py_modules":
            assert check_internal_kv_gced()
        else:
            assert not check_internal_kv_gced()
        assert not check_local_files_gced(cluster)

        ray.shutdown()

        # Need to re-connect to use internal_kv.
        ray.init(address=address)
        wait_for_condition(check_internal_kv_gced)
        wait_for_condition(lambda: check_local_files_gced(cluster))
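The option, source, field, and spec_format arguments in these examples come from pytest parametrization on the enclosing test classes and modules. A hedged sketch of how such a class might be parametrized (the S3 URI and local path below are illustrative placeholders):

import pytest

# Illustrative placeholder; the real tests point at a pre-uploaded test_module package.
S3_PACKAGE_URI = "s3://example-bucket/test_module.zip"


@pytest.mark.parametrize("option", ["working_dir", "py_modules"])
@pytest.mark.parametrize("source", [S3_PACKAGE_URI, "/path/to/local/test_module"])
class TestGC:
    def test_job_level_gc(self, start_cluster, option: str, source: str):
        ...  # body as in the examples above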