    def on_step_end(self, runner):
        if time.time() - self._start > self.deadline_s:
            logger.warning("Killing all trials - deadline hit.")
            for trial in runner.get_trials():
                self.stop_trial(trial)

            # Poll for up to 100 seconds (20 checks, 5 s apart) while trial resources are released.
            for _ in range(20):
                if ray.available_resources() != ray.cluster_resources():
                    print("Resources not released yet.")
                    time.sleep(5)
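
The resource-release polling at the end of this callback also works as a small standalone helper. Below is a minimal sketch under that assumption; the name wait_until_resources_released is hypothetical, and the 20-check, 5-second budget mirrors the loop above:

import time

import ray


def wait_until_resources_released(max_checks: int = 20, poll_s: float = 5.0) -> bool:
    """Return True once available resources match total cluster resources.

    Assumes ray.init() has already been called on this driver.
    """
    for _ in range(max_checks):
        if ray.available_resources() == ray.cluster_resources():
            return True
        print("Resources not released yet.")
        time.sleep(poll_s)
    return False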
Example #2
def test_resource_tune(ray_connect_cluster, use_gpu):
    if use_gpu and ray.cluster_resources().get("GPU", 0) == 0:
        pytest.skip("No GPU available.")
    trainable_cls = DistributedTrainableCreator(_train_simple,
                                                num_slots=2,
                                                use_gpu=use_gpu)
    analysis = tune.run(trainable_cls,
                        num_samples=2,
                        stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2
Example #3
def ray_init(LOCAL_MODE=False, **mainkwargs):
    available_cluster_cpus = 0
    available_cluster_gpus = 0
    DEBUG_MODE = mainkwargs['DEBUG_MODE']

    if is_predict_only(**mainkwargs):
        try:
            subprocess.run(["sudo", "pkill", "redis-server"])
            subprocess.run(["sudo", "pkill", "ray_RolloutWork"])
        except:
            print("ray process not running")
        LOCAL_MODE = DEBUG_MODE
        ray.init(local_mode=LOCAL_MODE,
                 num_cpus=max(1,
                              cpu_count() - 1),
                 num_gpus=1,
                 logging_level="ERROR")
        return available_cluster_cpus, available_cluster_gpus
    else:
        try:  # to init in the cluster
            ray.init(address=redis_add)
            ray_cluster_status_check(**mainkwargs)
            available_cluster_cpus = int(ray.cluster_resources().get("CPU"))
            available_cluster_gpus = int(ray.cluster_resources().get("GPU"))
            LOCAL_MODE = False
        except:  # try to init in your machine/isolated compute instance
            # Kill the redis-server. This seems the surest way to kill it
            subprocess.run(["sudo", "pkill", "redis-server"])
            subprocess.run(["sudo", "pkill", "ray_RolloutWork"])
            try:  # shut down for a fresh init, assuming a ray init process was started during the cluster attempt above
                ray.shutdown()
            except:
                print("ray shutdown failed. Perhaps ray was not initialized?")

            ray.init(local_mode=LOCAL_MODE)
        if not LOCAL_MODE:
            available_cluster_cpus = int(ray.available_resources().get("CPU"))
            if ray.available_resources().get("GPU") is not None:
                available_cluster_gpus = int(
                    ray.available_resources().get("GPU"))
            print("cluster_resources ", ray.cluster_resources(), "\n")
            print("available_resources ", ray.available_resources())
        return available_cluster_cpus, available_cluster_gpus
Example #4
def _get_cluster_cpus():
    """
    Get number of CPUs available on Ray cluster.

    Returns
    -------
    int
        Number of CPUs available on cluster.
    """
    return int(ray.cluster_resources().get("CPU", 1))
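
One possible use of this helper is to size a batch of remote tasks to the cluster's CPU count; the sketch below is hypothetical and relies only on standard Ray calls:

import ray

ray.init()  # start or attach to a Ray instance


@ray.remote
def work(i):
    return i * i


num_cpus = int(_get_cluster_cpus())  # cast defensively in case the helper returns a float
results = ray.get([work.remote(i) for i in range(num_cpus)])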
Example #5
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        """
        timeout = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > timeout:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}"
                )
            time.sleep(1)
            available = ray.cluster_resources()
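
Example #19 below calls this helper while connecting to a cluster. A minimal direct call might look like the following, assuming the method is exposed as a static/utility function and that ResourcesNotReadyError is importable from the same module:

import ray

ray.init(address="auto")  # attach to an already running cluster
try:
    wait_for_resources({"CPU": 4}, timeout=120)
except ResourcesNotReadyError:
    print("Cluster did not report 4 CPUs within 120 seconds.")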
Example #6
def _checkParallel(num_jobs, parallel_backend):
    """
    Helper function to determine how many workers (jobs) should be submitted.
    If the parallelization backend is Python's multiprocessing ('mp') and num_jobs
    is either "auto" or None, then mp.cpu_count() will be used to determine the number
    of jobs. If num_jobs is not "auto" or None, then that number will be used instead.
    If the parallelization backend is ray, then the number of resources on the cluster will
    determine the number of workers, and num_jobs is ignored.

    Parameters
    ----------
    num_jobs : {None, "auto", int}
        Number of jobs to launch.
    parallel_backend : str
        Name of backend. Should be one of {'ray', 'mp'}.

    Returns
    -------
    enable_parallel : bool
        True if parallelization should be used (true for cases where num_jobs > 1).
    num_workers : int
        The number of workers to use.

    Raises
    ------
    ValueError : If parallel_backend is not one of {'ray', 'mp'}.
    """
    if isinstance(num_jobs, str) or (num_jobs is None) or (num_jobs > 1):

        # Check that parallel_backend is one of the supported types
        backends = ["ray", "mp"]
        if parallel_backend not in backends:
            err = "parallel_backend should be one of {'ray', 'mp'}"
            raise ValueError(err)

        enable_parallel = True
        if parallel_backend == "ray":
            import ray
            if num_jobs != "auto" and num_jobs is not None:
                logger.warning(
                    "This process is running with the ray parallelization backend: num_jobs parameter will be ignored."
                )

            num_workers = int(ray.cluster_resources()["CPU"])

        else:
            if num_jobs == "auto" or num_jobs is None:
                num_workers = mp.cpu_count()
            else:
                num_workers = num_jobs
    else:
        num_workers = 1
        enable_parallel = False

    return enable_parallel, num_workers
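
A hypothetical caller might use the returned pair to choose between a serial loop and a multiprocessing pool. The sketch below assumes the 'mp' backend and defines a trivial workload purely for illustration:

import multiprocessing as mp


def process_item(x):
    return x * x


if __name__ == "__main__":
    items = list(range(100))
    enable_parallel, num_workers = _checkParallel(num_jobs="auto", parallel_backend="mp")
    if enable_parallel:
        with mp.Pool(num_workers) as pool:
            results = pool.map(process_item, items)
    else:
        results = [process_item(item) for item in items]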
Example #7
def test_ray_resources_environment_variable(ray_start_cluster):
    address = ray_start_cluster.address

    os.environ["RAY_OVERRIDE_RESOURCES"] = "{\"custom1\":1, \"custom2\":2}"
    ray.init(address=address, resources={"custom1": 3, "custom3": 3})

    cluster_resources = ray.cluster_resources()
    print(cluster_resources)
    assert cluster_resources["custom1"] == 1
    assert cluster_resources["custom2"] == 2
    assert cluster_resources["custom3"] == 3
Example #8
def test_global_state_api(shutdown_only):

    ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1})

    assert ray.cluster_resources()["CPU"] == 5
    assert ray.cluster_resources()["GPU"] == 3
    assert ray.cluster_resources()["CustomResource"] == 1

    job_id = ray._private.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))

    client_table = ray.nodes()
    node_ip_address = ray.worker.global_worker.node_ip_address

    assert len(client_table) == 1
    assert client_table[0]["NodeManagerAddress"] == node_ip_address

    @ray.remote
    class Actor:
        def __init__(self):
            pass

    _ = Actor.options(name="test_actor").remote()  # noqa: F841
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.state.actors()
    assert len(actor_table) == 1

    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert actor_info["Name"] == "test_actor"
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.state.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["DriverIPAddress"] == node_ip_address
Example #9
def test_ray_init(monkeypatch, shutdown_only):
    def getpid(args):
        return os.getpid()

    def check_pool_size(pool, size):
        args = [tuple() for _ in range(size)]
        assert len(set(pool.map(getpid, args))) == size

    # Check that starting a pool starts ray if not initialized.
    pool = Pool(processes=2)
    assert ray.is_initialized()
    assert int(ray.cluster_resources()["CPU"]) == 2
    check_pool_size(pool, 2)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Set up the cluster id so that gcs is talking with a different
    # storage prefix
    monkeypatch.setenv("RAY_external_storage_namespace", "new_cluster")
    ray._raylet.Config.initialize("")

    # Check that starting a pool doesn't affect ray if there is a local
    # ray cluster running.
    ray.init(num_cpus=3)
    assert ray.is_initialized()
    pool = Pool(processes=2)
    assert int(ray.cluster_resources()["CPU"]) == 3
    check_pool_size(pool, 2)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Check that trying to start a pool on an existing ray cluster throws an
    # error if there aren't enough CPUs for the number of processes.
    ray.init(num_cpus=1)
    assert ray.is_initialized()
    with pytest.raises(ValueError):
        Pool(processes=2)
    assert int(ray.cluster_resources()["CPU"]) == 1
    ray.shutdown()
Example #10
    def get_usage(self) -> dict:
        """ Get CPU, GPU, Memory utilization metrics"""
        ray.init(address="auto", _redis_password="******")
        total_resources = ray.cluster_resources()
        available_resources = ray.available_resources()

        res_dict = {}
        for param in total_resources.keys():
            if "node" not in param:
                res_dict[param] = f"{available_resources[param]}:{total_resources[param]}"
        return res_dict
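
The same 'available:total' report can be produced without the enclosing class; here is a standalone sketch that assumes the driver is already connected (so no ray.init call) and uses .get() so fully consumed resources default to 0.0:

import ray


def get_usage_report() -> dict:
    """Map each non-node resource to an 'available:total' string."""
    total = ray.cluster_resources()
    available = ray.available_resources()
    return {
        name: f"{available.get(name, 0.0)}:{capacity}"
        for name, capacity in total.items()
        if "node" not in name
    }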
Example #11
def run():
    ray.init(address='auto', _redis_password="")
    print('''This cluster consists of
        {} nodes in total
        {} CPU resources in total
    '''.format(len(ray.nodes()),
               ray.cluster_resources()['CPU']))
    lo = L.remote(1000)
    lo.warning.remote("Starting!")
    ray.get([run_worker.remote(i) for i in range(20)])
    # ray.get()
    lo.warning.remote("print from outside worker")
Example #12
def wait_for_nodes():
    # Wait for all nodes to join the cluster.
    while True:
        resources = ray.cluster_resources()
        node_keys = [key for key in resources if "node" in key]
        num_nodes = sum(resources[node_key] for node_key in node_keys)
        if num_nodes < NUM_WORKER:
            print("{} nodes have joined so far, waiting for {} more.".format(num_nodes, NUM_WORKER - num_nodes))
            sys.stdout.flush()
            time.sleep(1)
        else:
            break
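
A hypothetical driver script might call this gate right after connecting and before submitting work; NUM_WORKER is assumed to be a module-level constant, as in the snippet above:

import ray

NUM_WORKER = 4  # expected number of worker nodes (assumption)

ray.init(address="auto")
wait_for_nodes()  # the helper defined above


@ray.remote
def ping():
    return "ok"


print(ray.get([ping.remote() for _ in range(NUM_WORKER)]))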
Example #13
def get_trainer_kwargs(use_gpu=None):
    # Our goal is to have a worker per resource used for training.
    # The priority is GPUs, but can fall back to CPUs if there are no
    # GPUs available.
    if use_gpu is None:
        use_gpu = int(ray.cluster_resources().get("GPU", 0)) > 0

    if use_gpu:
        num_workers = int(ray.cluster_resources().get("GPU", 0))
    else:
        # TODO: use placement groups or otherwise spread across nodes
        node_resources = [node["Resources"] for node in ray.state.nodes()]
        num_workers = len(node_resources)

    return dict(
        # TODO travis: replace backend here once ray 1.8 released
        # backend='horovod',
        backend=HorovodConfig(),
        num_workers=num_workers,
        use_gpu=use_gpu,
    )
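
A quick, hypothetical way to exercise this helper is to inspect the derived settings before handing them to whatever trainer constructor the project uses (the trainer itself is outside this snippet):

import ray

ray.init(address="auto")
kwargs = get_trainer_kwargs()  # worker count and GPU flag derived from the cluster
print(kwargs["num_workers"], kwargs["use_gpu"])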
Example #14
def main():
    """The test simulates the workload with many threaded actors.

    Test is doing 4 things for 1 hour.

    - It first creates actors as many as num_cpus with max_concurrency=10
    - Each actor computes pi and put the result to the queue.
    - Driver keeps getting result & metadata from the actor.
    - Every X seconds, it kills all actors and restarts them.
    """
    ray.init(address="auto")
    args, unknown = parse_script_args()
    num_cpus = ray.cluster_resources()["CPU"]
    num_nodes = sum(1 for n in ray.nodes() if n["Alive"])
    print(f"Total number of actors: {num_cpus}, nodes: {num_nodes}")
    monitor_actor = monitor_memory_usage()

    start = time.time()
    while time.time() - start < args.test_runtime:
        # Step 1: Create actors and start computation loop.
        print("Create actors.")
        actors = start_actors(num_cpus, num_nodes)

        # Step 2: Get the pi result from actors.
        compute_start = time.time()
        print("Start computation.")
        while time.time() - compute_start < args.kill_interval_s:
            # Get the metadata.
            ray.get([actor.get_metadata.remote() for actor in actors])
            # Get the result.
            pb = ProgressBar("Computing Pi", num_cpus)
            results = [actor.get_pi.remote() for actor in actors]
            pb.fetch_until_complete(results)
            pb.close()

        # Step 3: Kill actors.
        print("Kill all actors.")
        for actor in actors:
            ray.kill(actor)

    # Report the result.
    print("PASSED.")
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print("Memory usage with failures.")
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per processes:\n {usage}")
    # Report the result.
    ray.get(monitor_actor.stop_run.remote())

    result = {"success": 0}
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
        f.write(json.dumps(result))
Example #15
def init(args: Args):
    if not args.ray_address:
        ray.init(resources={"worker": os.cpu_count()})
    else:
        ray.init(address=args.ray_address)
    logging_utils.init()
    logging.info(args)
    os.makedirs(constants.WORK_DIR, exist_ok=True)
    resources = ray.cluster_resources()
    logging.info(resources)
    args.num_workers = resources["worker"]
    progress_tracker = tracing_utils.create_progress_tracker(args)
    return progress_tracker
Example #16
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""

    cluster = start_connected_cluster
    nodes = []
    assert ray.cluster_resources()["CPU"] == 1
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert ray.available_resources().get("CPU", 0) == 0
    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 2
    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1
    runner.step()
    # Only 1 trial can be running due to resource limitation.
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6

    # This is to make sure that pg is ready for the previous pending trial,
    # so that when runner.step() is called next, the trial can be started in
    # the same event loop.
    time.sleep(5)
    runner.step()
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2
Example #17
def test_scaledown_shared_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 100 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 5,
            },
        },
        idle_timeout_minutes=0.05,
    )

    try:
        cluster.start(
            _system_config={"scheduler_report_pinned_bytes_only": True})
        ray.init("auto")

        actors = [Actor.remote() for _ in range(5)]
        ray.get([a.f.remote() for a in actors])
        print("All five nodes launched")

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 5)

        data = actors[0].create.remote(1024 * 1024 * 5)
        ray.get([a.recv.remote(data) for a in actors])
        print("Data broadcast successfully, deleting actors.")
        del actors

        # Verify scale-down.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 1,
                           timeout=30)
    finally:
        cluster.shutdown()
Example #18
def test_replica_startup_status_transitions(ray_cluster):
    cluster = ray_cluster
    cluster.add_node(num_cpus=1)
    cluster.connect(namespace="serve")
    serve_instance = serve.start()

    signal = SignalActor.remote()

    @serve.deployment(version="1", ray_actor_options={"num_cpus": 2})
    class E:
        def __init__(self):
            ray.get(signal.wait.remote())

    E.deploy(_blocking=False)

    def get_replicas(replica_state):
        controller = serve_instance._controller
        replicas = ray.get(
            controller._dump_replica_states_for_testing.remote(E.name))
        return replicas.get([replica_state])

    # wait for serve to start the replica, and catch a reference to it.
    wait_for_condition(lambda: len(get_replicas(ReplicaState.STARTING)) > 0)
    replica = get_replicas(ReplicaState.STARTING)[0]

    # FIXME: We switched our code formatter from YAPF to Black. Check whether we still
    # need shorthands and update the comment below. See issue #21318.
    # declare shorthands as yapf doesn't like long lambdas
    PENDING_ALLOCATION = ReplicaStartupStatus.PENDING_ALLOCATION
    PENDING_INITIALIZATION = ReplicaStartupStatus.PENDING_INITIALIZATION
    SUCCEEDED = ReplicaStartupStatus.SUCCEEDED

    # currently there are no resources to allocate the replica
    assert replica.check_started() == PENDING_ALLOCATION

    # add the necessary resources to allocate the replica
    cluster.add_node(num_cpus=4)
    wait_for_condition(lambda: (ray.cluster_resources().get("CPU", 0) >= 4))
    wait_for_condition(lambda: (ray.available_resources().get("CPU", 0) >= 2))

    def is_replica_pending_initialization():
        status = replica.check_started()
        print(status)
        return status == PENDING_INITIALIZATION

    wait_for_condition(is_replica_pending_initialization, timeout=25)

    # send signal to complete replica initialization
    signal.send.remote()
    wait_for_condition(lambda: replica.check_started() == SUCCEEDED)
Example #19
    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.

        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")
Example #20
def worker_test(ps, node_buffer, opt):
    agent = Actor(opt, job="test", buffer=ReplayBuffer)
    init_time = time.time()
    save_times = 0
    checkpoint_times = 0

    while True:
        weights = ray.get(ps.get_weights.remote())
        agent.set_weights(weights)
        start_actor_step, start_learner_step, _ = get_al_status(node_buffer)
        start_time = time.time()

        agent.run()

        last_actor_step, last_learner_step, _ = get_al_status(node_buffer)
        actor_step = np.sum(last_actor_step) - np.sum(start_actor_step)
        learner_step = np.sum(last_learner_step) - np.sum(start_learner_step)
        alratio = actor_step / (learner_step + 1)
        update_frequency = int(learner_step / (time.time() - start_time))
        total_learner_step = np.sum(last_learner_step)

        print("---------------------------------------------------")
        print("frame freq:", np.round((last_actor_step - start_actor_step) / (time.time() - start_time)))
        print("actor_steps:", np.sum(last_actor_step), "learner_step:", total_learner_step)
        print("actor leaner ratio: %.2f" % alratio)
        print("learner freq:", update_frequency)
        print("Ray total resources:", ray.cluster_resources())
        print("available resources:", ray.available_resources())
        print("---------------------------------------------------")

        total_time = time.time() - init_time

        if total_learner_step // opt.save_interval > save_times:
            with open(opt.save_dir + "/" + str(total_learner_step / 1e6) + "_weights.pickle", "wb") as pickle_out:
                pickle.dump(weights, pickle_out)
                print("****** Weights saved by time! ******")
            save_times = total_learner_step // opt.save_interval

        # save everything every checkpoint_freq s
        if total_time // opt.checkpoint_freq > checkpoint_times:
            print("save everything!")
            save_start_time = time.time()

            ps_save_op = [node_ps[i].save_weights.remote() for i in range(opt.num_nodes)]
            buffer_save_op = [node_buffer[node_index][model_type].save.remote() for model_type in model_types for node_index in range(opt.num_nodes)]
            ray.wait(buffer_save_op + ps_save_op, num_returns=opt.num_nodes * 6)       #5 models + ps

            print("total time for saving :", time.time() - save_start_time)
            checkpoint_times = total_time // opt.checkpoint_freq
Example #21
def test_ray_init(shutdown_only):
    def getpid(args):
        return os.getpid()

    def check_pool_size(pool, size):
        args = [tuple() for _ in range(size)]
        assert len(set(pool.map(getpid, args))) == size

    # Check that starting a pool starts ray if not initialized.
    pool = Pool(processes=2)
    assert ray.is_initialized()
    assert int(ray.cluster_resources()["CPU"]) == 2
    check_pool_size(pool, 2)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Check that starting a pool doesn't affect ray if there is a local
    # ray cluster running.
    ray.init(num_cpus=3)
    assert ray.is_initialized()
    pool = Pool(processes=2)
    assert int(ray.cluster_resources()["CPU"]) == 3
    check_pool_size(pool, 2)
    pool.terminate()
    pool.join()
    ray.shutdown()

    # Check that trying to start a pool on an existing ray cluster throws an
    # error if there aren't enough CPUs for the number of processes.
    ray.init(num_cpus=1)
    assert ray.is_initialized()
    with pytest.raises(ValueError):
        Pool(processes=2)
    assert int(ray.cluster_resources()["CPU"]) == 1
    ray.shutdown()
Example #22
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    foo = Foo.remote()
    print(foo)
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    total_cpus = ray.cluster_resources()["CPU"]

    # Occupy one cpu by an actor
    @ray.remote(num_cpus=1)
    class A:
        pass

    a = A.remote()
    print(a)

    @ray.remote(num_cpus=total_cpus)
    def g():
        pass

    pg = placement_group([{"CPU": total_cpus}], strategy="STRICT_PACK")
    g.options(placement_group=pg).remote()

    errors = get_error_message(p,
                               1,
                               ray_constants.INFEASIBLE_TASK_ERROR,
                               timeout=5)
    assert len(errors) == 0, errors
Example #23
def test_legacy_spillback_distribution(ray_start_cluster):
    cluster = ray_start_cluster
    # Create a head node and wait until it is up.
    cluster.add_node(
        num_cpus=0,
        _system_config={
            "scheduler_spread_threshold": 0,
        },
    )
    ray.init(address=cluster.address)
    cluster.wait_for_nodes()

    num_nodes = 2
    # create 2 worker nodes.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=8)
    cluster.wait_for_nodes()

    assert ray.cluster_resources()["CPU"] == 16

    @ray.remote
    def task():
        time.sleep(1)
        return ray.worker.global_worker.current_node_id

    # Make sure tasks are spilled back non-deterministically.
    locations = ray.get([task.remote() for _ in range(8)])
    counter = collections.Counter(locations)
    spread = max(counter.values()) - min(counter.values())
    # Ideally we'd want 4 tasks to go to each node, but we'll settle for
    # anything better than a 1-7 split since randomness is noisy.
    assert spread < 7
    assert len(counter) > 1

    @ray.remote(num_cpus=1)
    class Actor1:
        def __init__(self):
            pass

        def get_location(self):
            return ray.worker.global_worker.current_node_id

    actors = [Actor1.remote() for _ in range(10)]
    locations = ray.get([actor.get_location.remote() for actor in actors])
    counter = collections.Counter(locations)
    spread = max(counter.values()) - min(counter.values())
    assert spread < 7
    assert len(counter) > 1
Example #24
def test_uses_resources(ray_start_regular):
    cluster_resources = ray.cluster_resources()

    @ray.remote
    def cpu_task():
        time.sleep(1)

    cpu_task.remote()
    resource_used = False

    while not resource_used:
        available_resources = ray.available_resources()
        resource_used = available_resources.get(
            "CPU", 0) == cluster_resources.get("CPU", 0) - 1

    assert resource_used
Example #25
def test_replenish_resources(ray_start_regular):
    cluster_resources = ray.cluster_resources()
    available_resources = ray.available_resources()
    assert cluster_resources == available_resources

    @ray.remote
    def cpu_task():
        pass

    ray.get(cpu_task.remote())
    resources_reset = False

    while not resources_reset:
        available_resources = ray.available_resources()
        resources_reset = (cluster_resources == available_resources)
    assert resources_reset
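
A minimal sketch of the invariant these two tests exercise: while a CPU task runs, available CPUs dip below the cluster total, and they converge again once it finishes. Resource bookkeeping can lag slightly, which is why the tests above poll rather than assert immediately:

import time

import ray

ray.init(num_cpus=2)


@ray.remote
def busy():
    time.sleep(2)


ref = busy.remote()
time.sleep(0.5)  # give the scheduler a moment to account for the running task
print("while running:", ray.available_resources().get("CPU", 0), "of", ray.cluster_resources()["CPU"])
ray.get(ref)
time.sleep(0.5)  # bookkeeping may lag briefly after the task completes
print("after completion:", ray.available_resources().get("CPU", 0), "of", ray.cluster_resources()["CPU"])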
Example #26
    def get(self, id):
        runner = batches[id]
        runner.collect()

        done = len(runner.result)
        running = runner.total - done
        runtime, eta = runner.stats()

        return {
            'ncores': ray.cluster_resources()['CPU'],
            'trk_done': done,
            'trk_running': running,
            'started': str(runner.tstart),
            'runtime_seconds': runtime,
            'eta_seconds': eta
        }
Example #27
def init(config, mode):
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['OMP_NUM_THREADS'] = '1'

    if mode == 'local':
        ray.init(local_mode=True)
    elif mode == 'default':
        ray.init()
    elif mode == 'remote':
        ray.init(redis_address=config.HOST + ':6379')
        print('Cluster started with resources:')
        print(ray.cluster_resources())

    else:
        print('Invalid ray mode (local/default/remote)')
        exit(-1)
Example #28
def test_dynamic_res_creation(ray_start_regular):
    # This test creates a resource locally (without specifying the client_id)
    res_name = "test_res"
    res_capacity = 1.0

    @ray.remote
    def set_res(resource_name, resource_capacity):
        ray.experimental.set_resource(resource_name, resource_capacity)

    ray.get(set_res.remote(res_name, res_capacity))

    available_res = ray.available_resources()
    cluster_res = ray.cluster_resources()

    assert available_res[res_name] == res_capacity
    assert cluster_res[res_name] == res_capacity
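
Once the dynamic resource exists, ordinary tasks can request it through the standard resources= argument to ray.remote; a short follow-on sketch, assuming the setup above is still active:

@ray.remote(resources={"test_res": 1})
def uses_custom_resource():
    return "ran on a node that advertises test_res"


print(ray.get(uses_custom_resource.remote()))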
Example #29
def test_dynamic_res_creation_stress(ray_start_cluster):
    # This stress test creates many resources simultaneously on the same
    # client and then checks if the final state is consistent

    cluster = ray_start_cluster

    TIMEOUT = 5
    res_capacity = 1
    num_nodes = 5
    NUM_RES_TO_CREATE = 500

    for i in range(num_nodes):
        cluster.add_node()

    ray.init(redis_address=cluster.redis_address)

    clientids = [client["ClientID"] for client in ray.nodes()]
    target_clientid = clientids[1]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_client_id):
        ray.experimental.set_resource(resource_name,
                                      resource_capacity,
                                      client_id=res_client_id)

    @ray.remote
    def delete_res(resource_name, res_client_id):
        ray.experimental.set_resource(resource_name,
                                      0,
                                      client_id=res_client_id)

    results = [
        set_res.remote(str(i), res_capacity, target_clientid)
        for i in range(0, NUM_RES_TO_CREATE)
    ]
    ray.get(results)

    success = False
    start_time = time.time()

    while time.time() - start_time < TIMEOUT and not success:
        resources = ray.cluster_resources()
        all_resources_created = []
        for i in range(0, NUM_RES_TO_CREATE):
            all_resources_created.append(str(i) in resources)
        success = all(all_resources_created)
    assert success
Example #30
def test_remove_node_before_result(start_connected_emptyhead_cluster):
    """Tune continues when node is removed before trial returns."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }
    trial = Trial("__fake", **kwargs)
    runner.add_trial(trial)

    runner.step()  # Start trial, call _train once
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert not trial.last_result
    assert trial.status == Trial.RUNNING
    cluster.remove_node(node)
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1

    # Process result: fetch data, invoke _train again
    runner.step()
    assert trial.last_result.get("training_iteration") == 1

    # Process result: discover failure, recover, _train (from scratch)
    runner.step()

    runner.step()  # Process result, invoke _train
    assert trial.last_result.get("training_iteration") == 1
    runner.step()  # Process result, invoke _save
    assert trial.last_result.get("training_iteration") == 2
    # process save, invoke _train
    runner.step()
    # process result
    runner.step()
    assert trial.status == Trial.TERMINATED

    with pytest.raises(TuneError):
        runner.step()