Example #1
def submit_scaling_job(num_tasks):
    @ray.remote(num_cpus=1)
    def f(i):
        time.sleep(60)
        return i

    print(">>>Submitting tasks with Ray client.")
    futures = [f.remote(i) for i in range(num_tasks)]

    print(">>>Verifying scale-up.")
    # Expect as many pods as tasks.
    # (each Ray pod has 1 CPU)
    wait_for_pods(num_tasks)

    print(">>>Waiting for task output.")
    task_output = ray.get(futures, timeout=360)

    assert task_output == list(range(num_tasks)), "Tasks did not "\
        "complete with expected output."
Example #2
def submit_scaling_job(client_port, num_tasks):
    @ray.remote(num_cpus=1)
    def f(i):
        time.sleep(60)
        return i

    print(">>>Submitting tasks with Ray client.")
    ray.util.connect(f"127.0.0.1:{client_port}")
    futures = [f.remote(i) for i in range(num_tasks)]

    print(">>>Verifying scale-up.")
    # Operator pod plus number of tasks
    # (each Ray pod has 1 CPU).
    wait_for_pods(num_tasks + 1)

    print(">>>Waiting for task output.")
    task_output = ray.get(futures, timeout=360)

    assert task_output == list(range(num_tasks)), "Tasks did not "\
        "complete with expected output."
Example #3
    def test_scaling(self):
        with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
                tempfile.NamedTemporaryFile("w+") as example_cluster_file2, \
                tempfile.NamedTemporaryFile("w+") as operator_file:

            example_cluster_config_path = get_operator_config_path(
                "example_cluster.yaml")
            operator_config_path = get_operator_config_path(
                "operator_cluster_scoped.yaml")

            crd_path = get_operator_config_path("cluster_crd.yaml")

            operator_config = list(
                yaml.safe_load_all(open(operator_config_path).read()))
            example_cluster_config = yaml.safe_load(
                open(example_cluster_config_path).read())

            # Set image and pull policy
            podTypes = example_cluster_config["spec"]["podTypes"]
            pod_specs = [operator_config[-1]["spec"]["template"]["spec"]] + [
                podType["podConfig"]["spec"] for podType in podTypes
            ]
            for pod_spec in pod_specs:
                pod_spec["containers"][0]["image"] = IMAGE
                pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

            # Config set-up for this test.
            example_cluster_config["spec"]["maxWorkers"] = 100
            example_cluster_config["spec"]["idleTimeoutMinutes"] = 1
            worker_type = podTypes[1]
            # Make sure we have the right type
            assert "worker" in worker_type["name"]
            worker_type["maxWorkers"] = 100
            # Key for the first part of this test:
            worker_type["minWorkers"] = 30

            # Config for a small cluster with the same name to be launched
            # in another namespace.
            example_cluster_config2 = copy.deepcopy(example_cluster_config)
            example_cluster_config2["spec"]["podTypes"][1]["minWorkers"] = 1

            # Test overriding default client port.
            example_cluster_config["spec"]["headServicePorts"] = [{
                "name":
                "client",
                "port":
                10002,
                "targetPort":
                10001
            }]

            yaml.dump(example_cluster_config, example_cluster_file)
            yaml.dump(example_cluster_config2, example_cluster_file2)
            yaml.dump_all(operator_config, operator_file)

            files = [example_cluster_file, example_cluster_file2,
                     operator_file]
            for file in files:
                file.flush()

            print(">>>Creating operator.")
            cmd = f"kubectl apply -f {operator_file.name}"
            subprocess.check_call(cmd, shell=True)

            # Test creating operator before CRD.
            print(">>>Waiting for Ray operator to enter running state.")
            wait_for_operator()

            print(">>>Creating RayCluster CRD.")
            cmd = f"kubectl apply -f {crd_path}"
            subprocess.check_call(cmd, shell=True)
            # Takes a bit of time for CRD to register.
            time.sleep(10)

            # Start a 30-pod cluster.
            print(">>>Starting a cluster.")
            cmd = f"kubectl -n {NAMESPACE} apply -f "\
                f"{example_cluster_file.name}"
            subprocess.check_call(cmd, shell=True)

            print(">>>Starting a cluster with same name in another namespace")
            # Assumes a namespace called {NAMESPACE}2 has been created.
            cmd = f"kubectl -n {NAMESPACE}2 apply -f "\
                f"{example_cluster_file2.name}"
            subprocess.check_call(cmd, shell=True)

            # Check that autoscaling respects minWorkers by waiting for
            # 31 pods (head plus 30 workers) in one namespace and 2 pods
            # in the other.
            print(">>>Waiting for pods to join cluster.")
            wait_for_pods(31)
            wait_for_pods(2, namespace=f"{NAMESPACE}2")

            # Check scale-down.
            print(">>>Decreasing min workers to 0.")
            example_cluster_edit = copy.deepcopy(example_cluster_config)
            # Set minWorkers to 0:
            example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 0
            yaml.dump(example_cluster_edit, example_cluster_file)
            example_cluster_file.flush()
            cmd = f"kubectl -n {NAMESPACE} apply -f "\
                f"{example_cluster_file.name}"
            subprocess.check_call(cmd, shell=True)
            print(">>>Sleeping for a minute while workers time-out.")
            time.sleep(60)
            print(">>>Verifying scale-down.")
            wait_for_pods(1)

            with client_connect_to_k8s(port="10002"):
                # Test scale up and scale down after task submission.
                submit_scaling_job(num_tasks=15)

            print(">>>Sleeping for a minute while workers time-out.")
            time.sleep(60)
            print(">>>Verifying scale-down.")
            wait_for_pods(1)
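
client_connect_to_k8s is another helper from the test module. A hypothetical sketch of it as a context manager, assuming it port-forwards the head service's client port and connects the Ray client, tearing both down on exit; the service name and the exact cleanup calls are assumptions.

import subprocess
import time
from contextlib import contextmanager

import ray

@contextmanager
def client_connect_to_k8s(port="10001", namespace="default"):
    # Forward the head service's client port to localhost.
    proc = subprocess.Popen(
        f"kubectl -n {namespace} port-forward "
        f"service/example-cluster-ray-head {port}:{port}".split())
    try:
        time.sleep(10)  # Wait for the port-forward to come up.
        ray.util.connect(f"127.0.0.1:{port}")
        yield
    finally:
        ray.util.disconnect()
        proc.kill()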
Example #4
    def test_scaling(self):
        with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \
                tempfile.NamedTemporaryFile("w+") as operator_file:

            example_cluster_config_path = get_operator_config_path(
                "example_cluster.yaml")
            operator_config_path = get_operator_config_path("operator.yaml")

            operator_config = list(
                yaml.safe_load_all(open(operator_config_path).read()))
            example_cluster_config = yaml.safe_load(
                open(example_cluster_config_path).read())

            # Set image and pull policy
            podTypes = example_cluster_config["spec"]["podTypes"]
            pod_specs = [operator_config[-1]["spec"]] + [
                podType["podConfig"]["spec"] for podType in podTypes
            ]
            for pod_spec in pod_specs:
                pod_spec["containers"][0]["image"] = IMAGE
                pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY

            # Config set-up for this test.
            example_cluster_config["spec"]["maxWorkers"] = 100
            example_cluster_config["spec"]["idleTimeoutMinutes"] = 1
            worker_type = podTypes[1]
            # Make sure we have the right type
            assert "worker" in worker_type["name"]
            worker_type["maxWorkers"] = 100
            # Key for the first part of this test:
            worker_type["minWorkers"] = 30

            yaml.dump(example_cluster_config, example_cluster_file)
            yaml.dump_all(operator_config, operator_file)

            files = [example_cluster_file, operator_file]
            for file in files:
                file.flush()

            # Start operator and a 30-pod-cluster.
            print(">>>Starting operator and a cluster.")
            for file in files:
                cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
                subprocess.check_call(cmd, shell=True)

            # Check that autoscaling respects minWorkers by waiting for
            # 32 pods in the namespace.
            print(">>>Waiting for pods to join cluster.")
            wait_for_pods(32)

            # Check scale-down.
            print(">>>Decreasing min workers to 0.")
            example_cluster_edit = copy.deepcopy(example_cluster_config)
            # Set minWorkers to 0:
            example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 0
            yaml.dump(example_cluster_edit, example_cluster_file)
            example_cluster_file.flush()
            cmd = f"kubectl -n {NAMESPACE} apply -f "\
                f"{example_cluster_file.name}"
            subprocess.check_call(cmd, shell=True)
            print(">>>Sleeping for a minute while workers time-out.")
            time.sleep(60)
            print(">>>Verifying scale-down.")
            wait_for_pods(2)

            # Test scale up and scale down after task submission.
            command = f"kubectl -n {NAMESPACE}"\
                " port-forward service/example-cluster-ray-head 10001:10001"
            command = command.split()
            print(">>>Port-forwarding head service.")
            self.proc = subprocess.Popen(command)
            try:
                # Wait a bit for the port-forwarding connection to be
                # established.
                time.sleep(10)
                # Check that job submission works.
                submit_scaling_job(client_port="10001", num_tasks=15)
            finally:
                # Clean up the port-forwarding process whether or not the
                # job submission succeeded.
                self.proc.kill()

            print(">>>Sleeping for a minute while workers time-out.")
            time.sleep(60)
            print(">>>Verifying scale-down.")
            wait_for_pods(2)
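
Both tests also rely on module-level constants (IMAGE, PULL_POLICY, NAMESPACE) and the get_operator_config_path helper. A rough sketch of how they might be defined follows; the image tag, namespace, and config directory here are illustrative guesses, not the values used in CI.

import os

IMAGE = os.environ.get("RAY_IMAGE", "rayproject/ray:nightly")
PULL_POLICY = os.environ.get("PULL_POLICY", "IfNotPresent")
NAMESPACE = "test-k8s-operator"

def get_operator_config_path(config_file_name):
    # Assumes the operator example YAMLs sit in a directory next to this
    # test file; the actual layout in the Ray repo may differ.
    here = os.path.abspath(os.path.dirname(__file__))
    return os.path.join(here, "operator_configs", config_file_name)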