Example No. 1
def wait(object_ids, num_returns=1, timeout=None, worker=None):
    """Return a list of IDs that are ready and a list of IDs that are not.

    This method is identical to `ray.wait` except it adds support for tuples
    and ndarrays.

    Args:
        object_ids (List[ObjectID], Tuple(ObjectID), np.array(ObjectID)):
            List-like of object IDs for objects that may or may not be ready.
            Note that these IDs must be unique.
        num_returns (int): The number of object IDs that should be returned.
        timeout (float): The maximum amount of time in seconds to wait before
            returning.

    Returns:
        A list of object IDs that are ready and a list of the remaining object
            IDs.
    """
    worker = ray.worker.global_worker if worker is None else worker
    if isinstance(object_ids, (tuple, np.ndarray)):
        return ray.wait(
            list(object_ids),
            num_returns=num_returns,
            timeout=timeout,
            worker=worker)

    return ray.wait(
        object_ids, num_returns=num_returns, timeout=timeout, worker=worker)
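For context, here is a minimal, self-contained sketch of what this wrapper enables: passing a tuple of object IDs where plain `ray.wait` expects a list. The `square` task and the explicit conversion shown are illustrative assumptions, not part of the original module.

import ray

ray.init()

@ray.remote
def square(i):
    return i * i

# A tuple of object refs; plain ray.wait expects a list, so convert first.
# This conversion is exactly what the wrapper above performs for the caller.
refs = tuple(square.remote(i) for i in range(4))
ready, not_ready = ray.wait(list(refs), num_returns=2)
assert len(ready) == 2 and len(not_ready) == 2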
Example No. 2
def collect_samples(agents, sample_batch_size, num_envs_per_worker,
                    train_batch_size):
    """Collects at least train_batch_size samples, never discarding any."""

    num_timesteps_so_far = 0
    trajectories = []
    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while agent_dict:
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        next_sample = ray.get(fut_sample)
        assert next_sample.count >= sample_batch_size * num_envs_per_worker
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)

        # Only launch more tasks if we don't already have enough pending
        pending = len(agent_dict) * sample_batch_size * num_envs_per_worker
        if num_timesteps_so_far + pending < train_batch_size:
            fut_sample2 = agent.sample.remote()
            agent_dict[fut_sample2] = agent

    return SampleBatch.concat_samples(trajectories)
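A self-contained sketch of the same pattern, reduced to its core: pull results one at a time with `ray.wait` and resubmit to the freed worker only while the work already in flight cannot cover the target, so nothing is collected and then thrown away. The `sample` task and all constants below are hypothetical stand-ins.

import ray

ray.init()

BATCH = 10    # samples produced per task (stand-in for sample_batch_size)
TARGET = 100  # stand-in for train_batch_size

@ray.remote
def sample(worker_id):
    return BATCH  # stand-in for a SampleBatch whose .count equals BATCH

pending = {sample.remote(w): w for w in range(4)}
collected = 0

while pending:
    [done], _ = ray.wait(list(pending))  # block until one task finishes
    worker_id = pending.pop(done)
    collected += ray.get(done)
    # Resubmit only if the pending work cannot cover the remaining budget.
    if collected + len(pending) * BATCH < TARGET:
        pending[sample.remote(worker_id)] = worker_id

assert collected >= TARGET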
Example No. 3
def collect_samples_straggler_mitigation(agents, train_batch_size):
    """Collects at least train_batch_size samples.

    This is the legacy behavior as of 0.6, and launches extra sample tasks to
    potentially improve performance but can result in many wasted samples.
    """

    num_timesteps_so_far = 0
    trajectories = []
    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < train_batch_size:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)

    logger.info("Discarding {} sample tasks".format(len(agent_dict)))
    return SampleBatch.concat_samples(trajectories)
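For contrast with the previous example, a reduced sketch of this straggler-mitigation variant: a replacement task is launched for every result consumed, so workers never idle, and whatever is still in flight once the budget is met is simply discarded. All names here are hypothetical.

import ray

ray.init()

@ray.remote
def sample(worker_id):
    return 10  # samples produced per task

pending = {sample.remote(w): w for w in range(4)}
collected, target = 0, 100

while collected < target:
    [done], _ = ray.wait(list(pending))
    worker_id = pending.pop(done)
    # Eagerly keep every worker busy, even though the extra results
    # may never be needed.
    pending[sample.remote(worker_id)] = worker_id
    collected += ray.get(done)

print("discarding", len(pending), "in-flight sample tasks")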
Example No. 4
def collect_samples(agents, config, local_evaluator):
    num_timesteps_so_far = 0
    trajectories = []
    # This variable maps the object IDs of trajectories that are currently
    # computed to the agent that they are computed on; we start some initial
    # tasks here.

    agent_dict = {}

    for agent in agents:
        fut_sample = agent.sample.remote()
        agent_dict[fut_sample] = agent

    while num_timesteps_so_far < config["timesteps_per_batch"]:
        # TODO(pcm): Make wait support arbitrary iterators and remove the
        # conversion to list here.
        [fut_sample], _ = ray.wait(list(agent_dict))
        agent = agent_dict.pop(fut_sample)
        # Start task with next trajectory and record it in the dictionary.
        fut_sample2 = agent.sample.remote()
        agent_dict[fut_sample2] = agent

        next_sample = ray.get(fut_sample)
        num_timesteps_so_far += next_sample.count
        trajectories.append(next_sample)
    return SampleBatch.concat_samples(trajectories)
Example No. 5
    def as_future(self, object_id, check_ready=True):
        """Turn an object_id into a Future object.

        Args:
            object_id: A Ray's object_id.
            check_ready (bool): If true, check if the object_id is ready.

        Returns:
            PlasmaObjectFuture: A future object that waits on the object_id.
        """
        if not isinstance(object_id, ray.ObjectID):
            raise TypeError("Input should be an ObjectID.")

        plain_object_id = plasma.ObjectID(object_id.id())
        fut = PlasmaObjectFuture(loop=self._loop, object_id=plain_object_id)

        if check_ready:
            ready, _ = ray.wait([object_id], timeout=0)
            if ready:
                if self._loop.get_debug():
                    logger.debug("%s has been ready.", plain_object_id)
                self._complete_future(fut)
                return fut

        if plain_object_id not in self._waiting_dict:
            linked_list = PlasmaObjectLinkedList(self._loop, plain_object_id)
            linked_list.add_done_callback(self._unregister_callback)
            self._waiting_dict[plain_object_id] = linked_list
        self._waiting_dict[plain_object_id].append(fut)
        if self._loop.get_debug():
            logger.debug("%s added to the waiting list.", fut)

        return fut
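The key building block above is the non-blocking readiness probe: `ray.wait` with `timeout=0` returns immediately, so code running on an event loop can check whether an object is already available without blocking. A minimal sketch with a hypothetical `slow` task:

import time
import ray

ray.init()

@ray.remote
def slow():
    time.sleep(2)
    return 42

ref = slow.remote()

# timeout=0 makes ray.wait return immediately instead of blocking.
ready, _ = ray.wait([ref], timeout=0)
print("ready yet?", bool(ready))  # almost certainly False at this point

ray.get(ref)  # block until the task actually finishes
ready, _ = ray.wait([ref], timeout=0)
print("ready yet?", bool(ready))  # True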
Example No. 6
 def completed(self):
     pending = list(self._tasks)
     if pending:
         ready, _ = ray.wait(
             pending, num_returns=len(pending), timeout=0.01)
         for obj_id in ready:
             yield (self._tasks.pop(obj_id), self._objects.pop(obj_id))
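A standalone sketch of how a `completed()` generator like this is typically driven: poll all pending tasks with a short timeout and consume whatever has finished on each pass. The `TaskPool`-style bookkeeping is reduced to a plain dict, and every name here is hypothetical.

import random
import time
import ray

ray.init()

@ray.remote
def work(i):
    time.sleep(random.random())
    return i

tasks = {work.remote(i): i for i in range(8)}

def completed(tasks, poll_timeout=0.01):
    """Yield (tag, object ref) pairs for whatever is ready right now."""
    pending = list(tasks)
    if pending:
        ready, _ = ray.wait(pending, num_returns=len(pending),
                            timeout=poll_timeout)
        for ref in ready:
            yield tasks.pop(ref), ref

results = []
while tasks:
    for tag, ref in completed(tasks):
        results.append((tag, ray.get(ref)))

print(sorted(results))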
Example No. 7
    def poll(self):
        if self.pending is None:
            self.pending = {a.reset.remote(): a for a in self.actors}

        # each keyed by env_id in [0, num_remote_envs)
        obs, rewards, dones, infos = {}, {}, {}, {}
        ready = []

        # Wait for at least 1 env to be ready here
        while not ready:
            ready, _ = ray.wait(
                list(self.pending),
                num_returns=len(self.pending),
                timeout=self.timeout)

        # Get and return observations for each of the ready envs
        env_ids = set()
        for obj_id in ready:
            actor = self.pending.pop(obj_id)
            env_id = self.actors.index(actor)
            env_ids.add(env_id)
            ob, rew, done, info = ray.get(obj_id)
            obs[env_id] = ob
            rewards[env_id] = rew
            dones[env_id] = done
            infos[env_id] = info

        logger.debug("Got obs batch for actors {}".format(env_ids))
        return obs, rewards, dones, infos, {}
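A reduced sketch of the polling loop above: keep calling `ray.wait` with a bounded timeout until at least one pending actor task has finished, then drain everything that is ready. The `EnvActor` class is a hypothetical stand-in for the remote environments.

import random
import time
import ray

ray.init()

@ray.remote
class EnvActor:
    def step(self):
        time.sleep(random.random())
        return "observation"

actors = [EnvActor.remote() for _ in range(4)]
pending = {a.step.remote(): a for a in actors}

ready = []
# Block in slices of at most 0.5 s until at least one task has completed.
while not ready:
    ready, _ = ray.wait(list(pending), num_returns=len(pending), timeout=0.5)

for ref in ready:
    actor = pending.pop(ref)
    print("env", actors.index(actor), "returned", ray.get(ref))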
Example No. 8
 def completed(self):
     pending = list(self._tasks)
     if pending:
         ready, _ = ray.wait(pending, num_returns=len(pending), timeout=self.timeout)
         if not ready:
             return []
         for obj_id in ready:
             yield (self._tasks.pop(obj_id), obj_id)
Example No. 9
def test_dying_worker_get(shutdown_only):
    # Start the Ray processes.
    ray.init(num_cpus=2)

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    @ray.remote
    def get_worker_pid():
        return os.getpid()

    x_id = sleep_forever.remote()
    time.sleep(0.01)  # Try to wait for the sleep task to get scheduled.
    # Get the PID of the other worker.
    worker_pid = ray.get(get_worker_pid.remote())

    @ray.remote
    def f(id_in_a_list):
        ray.get(id_in_a_list[0])

    # Have the worker wait in a get call.
    result_id = f.remote([x_id])
    time.sleep(1)

    # Make sure the task hasn't finished.
    ready_ids, _ = ray.wait([result_id], timeout=0)
    assert len(ready_ids) == 0

    # Kill the worker.
    os.kill(worker_pid, signal.SIGKILL)
    time.sleep(0.1)

    # Make sure the sleep task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # get has been fulfilled.
    ray.worker.global_worker.put_object(x_id, 1)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.all_processes_alive()
Example No. 10
def test_wait(ray_start_combination):
    num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
    num_workers = num_nodes * num_workers_per_scheduler

    @ray.remote
    def f(x):
        return x

    x_ids = [f.remote(i) for i in range(100)]
    for i in range(len(x_ids)):
        ray.wait([x_ids[i]])
    for i in range(len(x_ids) - 1):
        ray.wait(x_ids[i:])

    @ray.remote
    def g(x):
        time.sleep(x)

    for i in range(1, 5):
        x_ids = [
            g.remote(np.random.uniform(0, i)) for _ in range(2 * num_workers)
        ]
        ray.wait(x_ids, num_returns=len(x_ids))

    assert cluster.remaining_processes_alive()
Example No. 11
  def testWait(self):
    ray.init(start_ray_local=True, num_workers=1)

    @ray.remote
    def f(delay):
      time.sleep(delay)
      return 1

    objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    ready_ids, remaining_ids = ray.wait(objectids)
    self.assertTrue(len(ready_ids) == 1)
    self.assertTrue(len(remaining_ids) == 3)
    ready_ids, remaining_ids = ray.wait(objectids, num_returns=4)
    self.assertEqual(ready_ids, objectids)
    self.assertEqual(remaining_ids, [])

    objectids = [f.remote(0.5), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    start_time = time.time()
    ready_ids, remaining_ids = ray.wait(objectids, timeout=1.75, num_returns=4)
    self.assertTrue(time.time() - start_time < 2)
    self.assertEqual(len(ready_ids), 3)
    self.assertEqual(len(remaining_ids), 1)
    ray.wait(objectids)
    objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    start_time = time.time()
    ready_ids, remaining_ids = ray.wait(objectids, timeout=5)
    self.assertTrue(time.time() - start_time < 5)
    self.assertEqual(len(ready_ids), 1)
    self.assertEqual(len(remaining_ids), 3)

    ray.worker.cleanup()
Example No. 12
    def testWait(self):
        for num_local_schedulers in [1, 4]:
            for num_workers_per_scheduler in [4]:
                num_workers = num_local_schedulers * num_workers_per_scheduler
                ray.worker._init(start_ray_local=True, num_workers=num_workers,
                                 num_local_schedulers=num_local_schedulers,
                                 num_cpus=100)

                @ray.remote
                def f(x):
                    return x

                x_ids = [f.remote(i) for i in range(100)]
                for i in range(len(x_ids)):
                    ray.wait([x_ids[i]])
                for i in range(len(x_ids) - 1):
                    ray.wait(x_ids[i:])

                @ray.remote
                def g(x):
                    time.sleep(x)

                for i in range(1, 5):
                    x_ids = [g.remote(np.random.uniform(0, i))
                             for _ in range(2 * num_workers)]
                    ray.wait(x_ids, num_returns=len(x_ids))

                self.assertTrue(ray.services.all_processes_alive())
                ray.worker.cleanup()
Example No. 13
def get_ray_results(pending_ids, ray_id_to_config):
    '''Helper to wait and get ray results into a new trial_data_dict, or handle ray error'''
    trial_data_dict = {}
    for _t in range(len(pending_ids)):
        ready_ids, pending_ids = ray.wait(pending_ids, num_returns=1)
        ready_id = ready_ids[0]
        try:
            trial_data = ray.get(ready_id)
            trial_index = trial_data.pop('trial_index')
            trial_data_dict[trial_index] = trial_data
        except:
            logger.exception(f'Trial failed: {ray_id_to_config[ready_id]}')
    return trial_data_dict
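A self-contained sketch of the same error-tolerant collection loop: wait for one result at a time and let a failed task surface as an exception from `ray.get` without aborting the remaining trials. The `trial` task and its payload are hypothetical.

import ray

ray.init()

@ray.remote
def trial(i):
    if i == 2:
        raise RuntimeError("simulated trial failure")
    return {"trial_index": i, "score": i * 0.1}

pending = [trial.remote(i) for i in range(4)]
results = {}

for _ in range(len(pending)):
    [ready_ref], pending = ray.wait(pending, num_returns=1)
    try:
        data = ray.get(ready_ref)  # raises if the task failed
        results[data.pop("trial_index")] = data
    except Exception:
        print("a trial failed; skipping its result")

print(results)  # only the trials that succeeded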
Example No. 14
def test_actor_worker_dying(ray_start_regular):
    @ray.remote
    class Actor(object):
        def kill(self):
            eval("exit()")

    @ray.remote
    def consume(x):
        pass

    a = Actor.remote()
    [obj], _ = ray.wait([a.kill.remote()], timeout=5.0)
    with pytest.raises(Exception):
        ray.get(obj)
    with pytest.raises(Exception):
        ray.get(consume.remote(obj))
    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
Example No. 15
    def sample(self):
        if self._remote_path is None:
            policy_params = self.policy.get_param_values()
            self._remote_path = self._remote_environment.rollout.remote(
                policy_params, self._max_path_length)

        path_ready, _ = ray.wait([self._remote_path], timeout=0)

        if len(path_ready) or not self.batch_ready():
            path = ray.get(self._remote_path)
            self.pool.add_path(path)
            self._remote_path = None
            self._total_samples += len(path['observations'])
            self._last_path_return = np.sum(path['rewards'])
            self._max_path_return = max(self._max_path_return,
                                        self._last_path_return)
            self._n_episodes += 1
Example No. 16
    def testActorWorkerDying(self):
        ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

        @ray.remote
        class Actor(object):
            def kill(self):
                eval("exit()")

        @ray.remote
        def consume(x):
            pass

        a = Actor.remote()
        [obj], _ = ray.wait([a.kill.remote()], timeout=5000)
        self.assertRaises(Exception, lambda: ray.get(obj))
        self.assertRaises(Exception, lambda: ray.get(consume.remote(obj)))
        wait_for_errors(b"worker_died", 1)
Example No. 17
def collect_episodes(local_evaluator,
                     remote_evaluators=[],
                     timeout_seconds=180):
    """Gathers new episodes metrics tuples from the given evaluators."""

    pending = [
        a.apply.remote(lambda ev: ev.get_metrics()) for a in remote_evaluators
    ]
    collected, _ = ray.wait(
        pending, num_returns=len(pending), timeout=timeout_seconds * 1.0)
    num_metric_batches_dropped = len(pending) - len(collected)

    metric_lists = ray.get(collected)
    metric_lists.append(local_evaluator.get_metrics())
    episodes = []
    for metrics in metric_lists:
        episodes.extend(metrics)
    return episodes, num_metric_batches_dropped
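A minimal sketch of the gather-with-deadline pattern used here: ask `ray.wait` for every pending result but cap the wall-clock wait, then treat whatever did not arrive in time as dropped. The `get_metrics` task and its sleep times are hypothetical.

import time
import ray

ray.init()

@ray.remote
def get_metrics(i):
    time.sleep(0.1 if i < 3 else 10)  # one deliberate straggler
    return {"episodes": i}

pending = [get_metrics.remote(i) for i in range(4)]

# Wait at most 2 seconds for everything; stragglers are simply dropped.
collected, _ = ray.wait(pending, num_returns=len(pending), timeout=2.0)
num_dropped = len(pending) - len(collected)

metric_lists = ray.get(collected)
print(metric_lists, "dropped:", num_dropped)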
Example No. 18
    def stop(self, error=False, error_msg=None, stop_logger=True):
        """Stops this trial.

        Stops this trial, releasing all allocated resources. If stopping the
        trial fails, the run will be marked as terminated in error, but no
        exception will be thrown.

        Args:
            error (bool): Whether to mark this trial as terminated in error.
            error_msg (str): Optional error message.
            stop_logger (bool): Whether to shut down the trial logger.
        """

        if error:
            self.status = Trial.ERROR
        else:
            self.status = Trial.TERMINATED

        try:
            if error_msg and self.logdir:
                self.num_failures += 1
                error_file = os.path.join(
                    self.logdir, "error_{}.txt".format(date_str()))
                with open(error_file, "w") as f:
                    f.write(error_msg)
                self.error_file = error_file
            if self.runner:
                stop_tasks = []
                stop_tasks.append(self.runner.stop.remote())
                stop_tasks.append(self.runner.__ray_terminate__.remote(
                    self.runner._ray_actor_id.id()))
                # TODO(ekl)  seems like wait hangs when killing actors
                _, unfinished = ray.wait(
                        stop_tasks, num_returns=2, timeout=250)
        except Exception:
            print("Error stopping runner:", traceback.format_exc())
            self.status = Trial.ERROR
        finally:
            self.runner = None

        if stop_logger and self.result_logger:
            self.result_logger.close()
            self.result_logger = None
Example No. 19
def test_actor_creation_node_failure(ray_start_cluster):
    # TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
    cluster = ray_start_cluster

    @ray.remote
    class Child(object):
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def ping(self):
            # Exit process with some probability.
            exit_chance = np.random.rand()
            if exit_chance < self.death_probability:
                sys.exit(-1)

    num_children = 100
    # Children actors will die about half the time.
    death_probability = 0.5

    children = [Child.remote(death_probability) for _ in range(num_children)]
    while len(cluster.list_all_nodes()) > 1:
        for j in range(3):
            # Submit some tasks on the actors. About half of the actors will
            # fail.
            children_out = [child.ping.remote() for child in children]
            # Wait a while for all the tasks to complete. This should trigger
            # reconstruction for any actor creation tasks that were forwarded
            # to nodes that then failed.
            ready, _ = ray.wait(
                children_out, num_returns=len(children_out), timeout=5 * 60.0)
            assert len(ready) == len(children_out)

            # Replace any actors that died.
            for i, out in enumerate(children_out):
                try:
                    ray.get(out)
                except ray.worker.RayTaskError:
                    children[i] = Child.remote(death_probability)
        # Remove a node. Any actor creation tasks that were forwarded to this
        # node must be reconstructed.
        cluster.remove_node(cluster.list_all_nodes()[-1])
Example No. 20
    def _process_events(self):
        [result_id], _ = ray.wait(list(self._running))
        trial = self._running.pop(result_id)
        try:
            result = ray.get(result_id)
            self._total_time += result.time_this_iter_s

            if trial.should_stop(result):
                self._scheduler_alg.on_trial_complete(self, trial, result)
                decision = TrialScheduler.STOP
            else:
                decision = self._scheduler_alg.on_trial_result(
                    self, trial, result)
            trial.update_last_result(
                result, terminate=(decision == TrialScheduler.STOP))

            if decision == TrialScheduler.CONTINUE:
                if trial.should_checkpoint():
                    # TODO(rliaw): This is a blocking call
                    trial.checkpoint()
                self._running[trial.train_remote()] = trial
            elif decision == TrialScheduler.PAUSE:
                self._pause_trial(trial)
            elif decision == TrialScheduler.STOP:
                self._stop_trial(trial)
            else:
                assert False, "Invalid scheduling decision: {}".format(
                    decision)
        except Exception:
            error_msg = traceback.format_exc()
            print("Error processing event:", error_msg)
            if trial.status == Trial.RUNNING:
                if trial.has_checkpoint() and \
                        trial.num_failures < trial.max_failures:
                    self._try_recover(trial, error_msg)
                else:
                    self._scheduler_alg.on_trial_error(self, trial)
                    self._stop_trial(trial, error=True, error_msg=error_msg)
Example No. 21
    def _stop_trial(self, trial, error=False, error_msg=None,
                    stop_logger=True):
        """Stops this trial.

        Stops this trial, releasing all allocated resources. If stopping the
        trial fails, the run will be marked as terminated in error, but no
        exception will be thrown.

        Args:
            error (bool): Whether to mark this trial as terminated in error.
            error_msg (str): Optional error message.
            stop_logger (bool): Whether to shut down the trial logger.
        """

        if stop_logger:
            trial.close_logger()

        if error:
            self.set_status(trial, Trial.ERROR)
        else:
            self.set_status(trial, Trial.TERMINATED)

        try:
            trial.write_error_log(error_msg)
            if hasattr(trial, 'runner') and trial.runner:
                stop_tasks = []
                stop_tasks.append(trial.runner.stop.remote())
                stop_tasks.append(trial.runner.__ray_terminate__.remote())
                # TODO(ekl)  seems like wait hangs when killing actors
                _, unfinished = ray.wait(
                    stop_tasks, num_returns=2, timeout=0.25)
        except Exception:
            logger.exception("Error stopping runner.")
            self.set_status(trial, Trial.ERROR)
        finally:
            trial.runner = None
Example No. 22
def test_dying_driver_wait(shutdown_only):
    # Start the Ray processes.
    address_info = ray.init(num_cpus=1)

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    x_id = sleep_forever.remote()

    driver = """
import ray
ray.init("{}")
ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
""".format(address_info["redis_address"], x_id.hex())

    p = run_string_as_driver_nonblocking(driver)
    # Make sure the driver is running.
    time.sleep(1)
    assert p.poll() is None

    # Kill the driver process.
    p.kill()
    p.wait()
    time.sleep(0.1)

    # Make sure the original task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # wait can return.
    ray.worker.global_worker.put_object(x_id, 1)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.all_processes_alive()
Example No. 23
def main(n_runners=6, n_repeats=5, n_episodes=2000, save=True, show=False):
    # Number of physical cores on machine
    num_cpus = psutil.cpu_count(logical=False)

    # Start ray
    ray.init(logging_level="ERROR")

    N_EPISODES = n_episodes
    N_REPEATS = n_repeats

    # Initialize workers
    env = ContinuousGridWorld()
    AGENT = DQNAgent_central(env)
    TARGET_UPDATE_INTERVAL = 30
    EP_LENGTH = 50
    steps_per_cycle = 50

    runners = [
        CentralizedRunner.remote(ContinuousGridWorld(), i)
        for i in range(n_runners)
    ]

    obj_ids = [
        runner.get_experience.remote(dict(AGENT.model.named_parameters()),
                                     EP_LENGTH) for runner in runners
    ]
    REWARDS = [[] for a in range(N_REPEATS)]
    for repeat_id in range(N_REPEATS):
        AGENT.reset_model()
        for i in tqdm(range(1, N_EPISODES + 1)):
            for _ in range(n_runners):
                ready, not_ready = ray.wait(obj_ids, timeout=1)
                if len(ready) == 0: continue
                tmp_reward = 0
                avg_n_steps = 0
                for r in ready:
                    batch, reward, actor_id = ray.get(r)
                    AGENT.add_batch(batch)
                    eps = max(0.5 * (1 - i / N_EPISODES), 0)
                    not_ready.append(runners[actor_id].get_experience.remote(
                        dict(AGENT.model.named_parameters()), EP_LENGTH, eps))
                    tmp_reward += reward
                    avg_n_steps += len(batch["R"])
                    obj_ids = not_ready
                    continue
            REWARDS[repeat_id].append(tmp_reward / len(ready))
            # avg_n_steps /= len(ready)
            for _ in range(int(avg_n_steps)):
                AGENT.optimize()
            if i % TARGET_UPDATE_INTERVAL == 0:
                AGENT.update_target()

    if show:
        plt.figure()
        for r in range(N_REPEATS):
            plt.plot(np.convolve(REWARDS[r], np.ones(100) / 100.,
                                 mode='valid'))
        plt.show()
    if save:
        max_length = max([len(a) for a in REWARDS])
        reward_array = np.zeros((N_REPEATS, max_length))
        for r in range(N_REPEATS):
            reward_array[r] = REWARDS[r][:max_length]
        np.save(
            f"results/baseline/centralized_experience_{n_runners}runners.npy",
            reward_array)
Example No. 24
    # Launch some initial experiments.
    for _ in range(args.num_starting_segments):
        hyperparameters = generate_hyperparameters()
        experiment_id = objective.train_cnn_and_compute_accuracy.remote(
            hyperparameters, steps, train_images, train_labels,
            validation_images, validation_labels)
        experiment_info[experiment_id] = {
            "hyperparameters": hyperparameters,
            "total_num_steps": steps,
            "accuracies": []
        }
        remaining_ids.append(experiment_id)

    for _ in range(args.num_segments):
        # Wait for a segment of an experiment to finish.
        ready_ids, remaining_ids = ray.wait(remaining_ids, num_returns=1)
        experiment_id = ready_ids[0]
        # Get the accuracy and the weights.
        accuracy, weights = ray.get(experiment_id)
        # Update the experiment info.
        previous_info = experiment_info[experiment_id]
        previous_info["accuracies"].append(accuracy)

        # Update the best accuracy and best hyperparameters.
        if accuracy > best_accuracy:
            best_hyperparameters = hyperparameters
            best_accuracy = accuracy

        if is_promising(previous_info):
            # If the experiment still looks promising, then continue running it.
            print("Continuing to run the experiment with hyperparameters {}.".
Example No. 25
    def apply(self, fn: Any, remote_args: dict,
              blocks: Iterable[Block[T]]) -> Iterable[ObjectRef[Block]]:

        map_bar = ProgressBar("Map Progress", total=len(blocks))

        class Worker:
            def ready(self):
                return "ok"

            @ray.method(num_returns=2)
            def process_block(
                    self, block: Block[T],
                    meta: BlockMetadata) -> (Block[U], BlockMetadata):
                new_block = fn(block)
                new_metadata = BlockMetadata(num_rows=new_block.num_rows(),
                                             size_bytes=new_block.size_bytes(),
                                             schema=new_block.schema(),
                                             input_files=meta.input_files)
                return new_block, new_metadata

        if "num_cpus" not in remote_args:
            remote_args["num_cpus"] = 1
        Worker = ray.remote(**remote_args)(Worker)

        workers = [Worker.remote()]
        metadata_mapping = {}
        tasks = {w.ready.remote(): w for w in workers}
        ready_workers = set()
        blocks_in = [(b, m) for (b, m) in zip(blocks, blocks.get_metadata())]
        blocks_out = []

        while len(blocks_out) < len(blocks):
            ready, _ = ray.wait(list(tasks),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
                if len(ready_workers) / len(workers) > 0.8:
                    w = Worker.remote()
                    workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks[obj_id]
            del tasks[obj_id]

            # Process task result.
            if worker in ready_workers:
                blocks_out.append(obj_id)
                map_bar.update(1)
            else:
                ready_workers.add(worker)

            # Schedule a new task.
            if blocks_in:
                block_ref, meta_ref = worker.process_block.remote(
                    *blocks_in.pop())
                metadata_mapping[block_ref] = meta_ref
                tasks[block_ref] = worker

        new_metadata = ray.get([metadata_mapping[b] for b in blocks_out])
        map_bar.close()
        return BlockList(blocks_out, new_metadata)
Example No. 26
 def reset(self):
     obj_ids = [actor.reset.remote() for actor in self.actors]
     results = ray.get(ray.wait(obj_ids, num_returns=self.num_envs)[0])
     # TODO: should update self.results, but it's OK because this function is only invoked at the very start
     return np.stack(results)
Example No. 27
                "stddev": 10 ** np.random.uniform(-5, 5)}

    # Randomly generate some hyperparameters, and launch a task for each set.
    for i in range(trials):
        hyperparameters = generate_hyperparameters()
        accuracy_id = objective.train_cnn_and_compute_accuracy.remote(
            hyperparameters, steps, train_images, train_labels,
            validation_images, validation_labels)
        remaining_ids.append(accuracy_id)
        # Keep track of which hyperparameters correspond to this experiment.
        hyperparameters_mapping[accuracy_id] = hyperparameters

    # Fetch and print the results of the tasks in the order that they complete.
    for i in range(trials):
        # Use ray.wait to get the object ID of the first task that completes.
        ready_ids, remaining_ids = ray.wait(remaining_ids)
        # Process the output of this task.
        result_id = ready_ids[0]
        hyperparameters = hyperparameters_mapping[result_id]
        accuracy, _ = ray.get(result_id)
        print("""We achieve accuracy {:.3}% with
            learning_rate: {:.2}
            batch_size: {}
            dropout: {:.2}
            stddev: {:.2}
          """.format(100 * accuracy,
                     hyperparameters["learning_rate"],
                     hyperparameters["batch_size"],
                     hyperparameters["dropout"],
                     hyperparameters["stddev"]))
        if accuracy > best_accuracy:
Example No. 28
 def get_next_available_trial(self):
     [result_id], _ = ray.wait(list(self._running))
     return self._running[result_id]
Example No. 29
 def get_model(self):
     ready, _ = ray.wait(
         [r.get_models.remote() for r in self.remote_workers])
     models = ray.get(ready[0])
     return models
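The pattern here is "take whichever replica answers first": `ray.wait` defaults to `num_returns=1`, so only the fastest `get_models` reply is fetched and the remaining calls are left in flight. A reduced sketch with hypothetical workers:

import random
import time
import ray

ray.init()

@ray.remote
class Worker:
    def get_models(self):
        time.sleep(random.random())
        return {"weights": "..."}

workers = [Worker.remote() for _ in range(3)]

# num_returns defaults to 1, so this blocks only until the fastest reply.
ready, _ = ray.wait([w.get_models.remote() for w in workers])
models = ray.get(ready[0])
print(models)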
Example No. 30
def run(
    run_or_experiment: Union[str, Callable, Type],
    name: Optional[str] = None,
    metric: Optional[str] = None,
    mode: Optional[str] = None,
    stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], bool]] = None,
    time_budget_s: Union[None, int, float, datetime.timedelta] = None,
    config: Optional[Dict[str, Any]] = None,
    resources_per_trial: Union[
        None, Mapping[str, Union[float, int, Mapping]], PlacementGroupFactory
    ] = None,
    num_samples: int = 1,
    local_dir: Optional[str] = None,
    search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
    scheduler: Optional[Union[TrialScheduler, str]] = None,
    keep_checkpoints_num: Optional[int] = None,
    checkpoint_score_attr: Optional[str] = None,
    checkpoint_freq: int = 0,
    checkpoint_at_end: bool = False,
    verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
    progress_reporter: Optional[ProgressReporter] = None,
    log_to_file: bool = False,
    trial_name_creator: Optional[Callable[[Trial], str]] = None,
    trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
    sync_config: Optional[SyncConfig] = None,
    export_formats: Optional[Sequence] = None,
    max_failures: int = 0,
    fail_fast: bool = False,
    restore: Optional[str] = None,
    server_port: Optional[int] = None,
    resume: bool = False,
    reuse_actors: bool = False,
    trial_executor: Optional[RayTrialExecutor] = None,
    raise_on_failed_trial: bool = True,
    callbacks: Optional[Sequence[Callback]] = None,
    max_concurrent_trials: Optional[int] = None,
    # Deprecated args
    queue_trials: Optional[bool] = None,
    loggers: Optional[Sequence[Type[Logger]]] = None,
    _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory, can be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats to export at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' is provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", "AUTO",
            or bool. "LOCAL"/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. "REMOTE" restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            "PROMPT" provides the CLI feedback.
            False forces a new experiment. "ERRORED_ONLY" resets and reruns
            errored trials upon resume - previous trial artifacts will
            be left untouched.
            "AUTO" will attempt to resume from a checkpoint and otherwise
            start a new experiment.
            If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """

    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`"
        )

    # Starting deprecation in ray 1.10.
    if os.environ.get("TUNE_TRIAL_RESULT_WAIT_TIME_S") is not None:
        warnings.warn("`TUNE_TRIAL_RESULT_WAIT_TIME_S` is deprecated.")

    if os.environ.get("TUNE_TRIAL_STARTUP_GRACE_PERIOD") is not None:
        warnings.warn("`TUNE_TRIAL_STARTUP_GRACE_PERIOD` is deprecated.")

    if os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S") is not None:
        warnings.warn("`TUNE_PLACEMENT_GROUP_WAIT_S` is deprecated.")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is run to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the server node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(
                actor_options={"num_cpus": 0, **force_on_current_node(None)}
            )
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()

                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html"
        )

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']"
        )

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler

        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead."
            )
        result_buffer_length = 1

    if (
        isinstance(scheduler, (PopulationBasedTraining, PopulationBasedTrainingReplay))
        and not reuse_actors
    ):
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable."
        )

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length
    )
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore,
            )
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher

        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials or is_local_mode:
        if isinstance(search_alg, ConcurrencyLimiter):
            if not is_local_mode:
                if search_alg.max_concurrent != max_concurrent_trials:
                    raise ValueError(
                        "You have specified `max_concurrent_trials="
                        f"{max_concurrent_trials}`, but the `search_alg` is "
                        "already a `ConcurrencyLimiter` with `max_concurrent="
                        f"{search_alg.max_concurrent}. FIX THIS by setting "
                        "`max_concurrent_trials=None`."
                    )
                else:
                    logger.warning(
                        "You have specified `max_concurrent_trials="
                        f"{max_concurrent_trials}`, but the `search_alg` is "
                        "already a `ConcurrencyLimiter`. "
                        "`max_concurrent_trials` will be ignored."
                    )
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater or equal than 1, "
                    f"got {max_concurrent_trials}."
                )
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials
                )
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance`. `max_concurrent_trials` "
                    "will be ignored."
                )

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not searcher_set_search_properties_backwards_compatible(
        search_alg.set_search_properties,
        metric,
        mode,
        config,
        **experiments[0].public_spec,
    ):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary."
            )

    if not scheduler_set_search_properties_backwards_compatible(
        scheduler.set_search_properties, metric, mode, **experiments[0].public_spec
    ):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`"
        )

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers
    )

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir),
    )

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info(
            "TrialRunner resumed, ignoring new add_experiment but "
            "updating trial resources."
        )
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`"
        )
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(
        experiments=experiments, total_num_samples=search_alg.total_samples
    )

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial, dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning(
                "Tune detects GPUs, but no trials are using GPUs. "
                "To enable trials to use GPUs, set "
                "tune.run(resources_per_trial={'gpu': 1}...) "
                "which allows Tune to expose 1 GPU to each trial. "
                "You can also override "
                "`Trainable.default_resource_request` if using the "
                "Trainable API."
            )

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. "
        )
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    # We should only install the handler when it is safe to do so.
    # When tune.run() is called from a worker thread, signal.signal will
    # fail.
    if threading.current_thread() != threading.main_thread():
        os.environ["TUNE_DISABLE_SIGINT_HANDLER"] = "1"

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(
            f"Total run time: {all_taken:.2f} seconds "
            f"({tune_taken:.2f} seconds for the tuning loop)."
        )

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`"
        )

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config,
    )
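
For context, here is a minimal usage sketch of tune.run(), whose driver loop and ExperimentAnalysis return value appear above; the objective function and search space below are illustrative, not part of the original code:

from ray import tune

def objective(config):
    # Report a toy score so the scheduler and reporter have something to track.
    tune.report(score=config["x"] ** 2)

analysis = tune.run(
    objective,
    config={"x": tune.uniform(0.0, 1.0)},
    num_samples=4,
    metric="score",
    mode="min",
)
print(analysis.best_config)  # from the ExperimentAnalysis returned above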
Ejemplo n.º 31
0
 def put():
     ray.wait([signal_actor.wait.remote()])
     return np.random.rand(5 * 1024 * 1024)  # 40 MB data
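
The task above blocks on a signal actor before returning its ~40 MB payload. A self-contained, hedged sketch of that pattern follows; the SignalActor class here is illustrative, not the original test helper:

import asyncio

import numpy as np
import ray

ray.init()

@ray.remote
class SignalActor:
    def __init__(self):
        self.ready = asyncio.Event()

    def send(self):
        self.ready.set()

    async def wait(self):
        await self.ready.wait()

signal_actor = SignalActor.remote()

@ray.remote
def put():
    # Block until the signal is sent, then return ~40 MB of random data.
    ray.wait([signal_actor.wait.remote()])
    return np.random.rand(5 * 1024 * 1024)

ref = put.remote()
ray.get(signal_actor.send.remote())  # unblock the pending task
data = ray.get(ref)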
Ejemplo n.º 32
0
def test_atomic_creation(ray_start_cluster):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        time.sleep(6)
        return True

    # Schedule tasks to fail initial placement group creation.
    tasks = [bothering_task.remote() for _ in range(2)]

    # Make sure the two bothering tasks have been scheduled.
    def tasks_scheduled():
        return ray.available_resources()["CPU"] == 2.0

    wait_for_condition(tasks_scheduled)

    # Create a placement group whose bundles cannot all be scheduled yet
    # because the bothering tasks are holding the CPUs they need.
    pg = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{
            "CPU": bundle_cpu_size
        } for _ in range(num_nodes * bundle_per_node)])

    # Create a placement group actor.
    # This shouldn't be scheduled because atomic
    # placement group creation should've failed.
    pg_actor = NormalActor.options(
        placement_group=pg,
        placement_group_bundle_index=num_nodes * bundle_per_node - 1).remote()

    # Wait on the placement group now. It should be unready because the
    # bothering tasks still hold resources required to create its bundles.
    ready, unready = ray.wait([pg.ready()], timeout=0.5)
    assert len(ready) == 0
    assert len(unready) == 1
    # Wait until all tasks are done.
    assert all(ray.get(tasks))

    # Wait on the placement group creation. Since resources are now available,
    # it should be ready soon.
    ready, unready = ray.wait([pg.ready()])
    assert len(ready) == 1
    assert len(unready) == 0

    # Confirm that the placement group actor is created. It will
    # raise an exception if actor was scheduled before placement
    # group was created thus it checks atomicity.
    ray.get(pg_actor.ping.remote(), timeout=3.0)
    ray.kill(pg_actor)

    # Make sure atomic creation failure didn't impact resources.
    @ray.remote(num_cpus=bundle_cpu_size)
    def resource_check():
        return True

    # These should hang because all resources
    # are claimed by the placement group.
    check_without_pg = [
        resource_check.remote() for _ in range(bundle_per_node * num_nodes)
    ]

    # These should all be scheduled, one on each bundle.
    check_with_pg = [
        resource_check.options(
            placement_group=pg, placement_group_bundle_index=i).remote()
        for i in range(bundle_per_node * num_nodes)
    ]

    # Make sure these are hanging.
    ready, unready = ray.wait(check_without_pg, timeout=0)
    assert len(ready) == 0
    assert len(unready) == bundle_per_node * num_nodes

    # Make sure these are all scheduled.
    assert all(ray.get(check_with_pg))

    ray.util.remove_placement_group(pg)

    def pg_removed():
        return ray.util.placement_group_table(pg)["state"] == "REMOVED"

    wait_for_condition(pg_removed)

    # Make sure the checks without a placement group are all
    # scheduled properly now that its resources have been released.
    assert all(ray.get(check_without_pg))
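
The test above uses ray.wait with a short (or zero) timeout to poll placement group readiness without blocking. A minimal hedged sketch of that check, assuming a running Ray cluster with at least one free CPU:

import ray
from ray.util import placement_group

ray.init()
pg = placement_group(bundles=[{"CPU": 1}])

# timeout=0 turns the wait into a non-blocking poll of the readiness future.
ready, _ = ray.wait([pg.ready()], timeout=0)
print("ready" if ready else "not ready yet")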
Ejemplo n.º 33
0
 iteration = 0
 while iteration != args.iterations:
     iteration += 1
     model_id = ray.put(model)
     actions = []
     # Launch tasks to compute gradients from multiple rollouts in parallel.
     start_time = time.time()
     # Run a rollout on each of the batch_size actors.
     for i in range(batch_size):
         # compute_gradient.remote() returns an object ID for the (grad, reward_sum) pair.
         action_id = actors[i].compute_gradient.remote(model_id)
         actions.append(action_id)
     for i in range(batch_size):
         # wait for one actor to finish its operation
         # action_id is the ready object id
         action_id, actions = ray.wait(actions)
         grad, reward_sum = ray.get(action_id[0])
         # Accumulate the gradient of each weight parameter over batch.
         for k in model:
             grad_buffer[k] += grad[k]
         running_reward = (reward_sum if running_reward is None else
                           running_reward * 0.99 + reward_sum * 0.01)
     end_time = time.time()
     print("Batch {} computed {} rollouts in {} seconds, "
           "running mean is {}".format(batch_num, batch_size,
                                       end_time - start_time,
                                       running_reward))
     # update gradient after one iteration
     for k, v in model.items():
         g = grad_buffer[k]
         rmsprop_cache[k] = (decay_rate * rmsprop_cache[k] +
Ejemplo n.º 34
0
 def time_wait_timeout(self, timeout):
     ray.wait([sleep.remote(0.5)], timeout=timeout)
Ejemplo n.º 35
0
 def time_wait_many_tasks(self, num_returns):
     tasks = [sleep.remote(i / 5) for i in range(4)]
     ray.wait(tasks, num_returns=num_returns)
Ejemplo n.º 36
0
 def time_wait_task(self):
     ray.wait([sleep.remote(0.1)])
Ejemplo n.º 37
0
        # Compute and apply gradients.
        # compute_tasks = [worker.compute_gradients.remote(current_weights) for worker in workers]
        fobj_to_workerID_dict = {}  # maps remote object IDs back to worker IDs
        compute_tasks = []

        for i in range(k):
            # tic = time.time()
            for worker_id in range(0,args.num_workers):
                worker = workers[worker_id]
                remotefn = worker.compute_gradients.remote(current_weights)
                compute_tasks.append(remotefn)
                fobj_to_workerID_dict[remotefn] = worker_id
            # toc = time.time()
            # print("Schedule task time: ", str(toc-tic))

        fast_function_ids, straggler_function_ids = ray.wait(compute_tasks, num_returns=k)
        fast_worker_IDs = [fobj_to_workerID_dict[fastfn_id] for fastfn_id in fast_function_ids]
        straggler_worker_IDs = [fobj_to_workerID_dict[stragglerfn_id] for stragglerfn_id in straggler_function_ids]
        print(len(fast_function_ids), k, len(straggler_worker_IDs), fast_worker_IDs)
        fast_gradients = [ray.get(fast_id) for fast_id in fast_function_ids]

        current_weights = ps.apply_gradients.remote(*fast_gradients)
        net.variables.set_flat(ray.get(current_weights))
        test_xs, test_ys = mnist.test.next_batch(1000)
        accuracy = net.compute_accuracy(test_xs, test_ys)
        print("Iteration {} : accuracy is {}".format(iteration, accuracy))
        iteration += 1

        fast_function_ids, straggler_function_ids = ray.wait(compute_tasks, num_returns=k * args.num_workers)
        print(len(straggler_function_ids))
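
The loop above keeps only the first k gradient results and ignores the stragglers. A simplified, self-contained sketch of that k-out-of-n pattern (the remote function and constants here are stand-ins):

import ray

ray.init()

@ray.remote
def compute_gradients(weights):
    # Stand-in for a real gradient computation.
    return weights

k = 2
current_weights = 0.0
tasks = [compute_gradients.remote(current_weights) for _ in range(4)]
fast, stragglers = ray.wait(tasks, num_returns=k)
gradients = ray.get(fast)  # results from the stragglers are simply not used this round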
Ejemplo n.º 38
0
def test_two_custom_resources(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(
        num_cpus=3, resources={
            "CustomResource1": 1,
            "CustomResource2": 2
        })
    custom_resource_node = cluster.add_node(
        num_cpus=3, resources={
            "CustomResource1": 3,
            "CustomResource2": 4
        })
    ray.init(address=cluster.address)

    @ray.remote
    def foo():
        # Sleep a while to emulate a slow operation. This is needed to make
        # sure tasks are scheduled to different nodes.
        time.sleep(0.1)
        return ray.worker.global_worker.node.unique_id

    # Make sure each node has at least one idle worker.
    wait_for_condition(
        lambda: len(set(ray.get([foo.remote() for _ in range(6)]))) == 2)

    @ray.remote(resources={"CustomResource1": 1})
    def f():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource2": 1})
    def g():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource1": 1, "CustomResource2": 3})
    def h():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource1": 4})
    def j():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    @ray.remote(resources={"CustomResource3": 1})
    def k():
        time.sleep(0.001)
        return ray.worker.global_worker.node.unique_id

    # The f and g tasks should be scheduled on both raylets.
    assert len(set(ray.get([f.remote() for _ in range(500)]))) == 2
    assert len(set(ray.get([g.remote() for _ in range(500)]))) == 2

    # The h tasks should be scheduled only on the second raylet.
    raylet_ids = set(ray.get([h.remote() for _ in range(50)]))
    assert len(raylet_ids) == 1
    assert list(raylet_ids)[0] == custom_resource_node.unique_id

    # Make sure that tasks with unsatisfied custom resource requirements do
    # not get scheduled.
    ready_ids, remaining_ids = ray.wait([j.remote(), k.remote()], timeout=0.5)
    assert ready_ids == []
Ejemplo n.º 39
0
def run_driver():
    output = run_string_as_driver(driver_script)
    assert "success" in output


iteration = 0
running_ids = [
    run_driver._remote(
        args=[], kwargs={}, num_cpus=0, resources={str(i): 0.01})
    for i in range(num_nodes)
]
start_time = time.time()
previous_time = start_time
while True:
    # Wait for a driver to finish and start a new driver.
    [ready_id], running_ids = ray.wait(running_ids, num_returns=1)
    ray.get(ready_id)

    running_ids.append(
        run_driver._remote(
            args=[],
            kwargs={},
            num_cpus=0,
            resources={str(iteration % num_nodes): 0.01}))

    new_time = time.time()
    print("Iteration {}:\n"
          "  - Iteration time: {}.\n"
          "  - Absolute time: {}.\n"
          "  - Total elapsed time: {}.".format(
              iteration, new_time - previous_time, new_time,
Ejemplo n.º 41
0
                        required=True)
    args = parser.parse_args()
    with open(args.config_path, "r") as config_file:
        config = defaultdict(dict)
        config.update(yaml.load(config_file, Loader=yaml.FullLoader))
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(config['gpu'])
    os.environ["QT_DEBUG_PLUGINS"] = "0"
    ray.init(webui_host='0.0.0.0', num_gpus=1, log_to_driver=False)

    # Preparation
    generation_config = dict(max_eps=10, num_actors=4)
    if 'generate' in config.keys():
        for key, value in config['generate'].items():
            assert key in generation_config.keys()
            generation_config[key] = value
    actors = make_remote_base(config, generation_config['num_actors'])

    # Start tasks
    generators = {}
    for a in actors:
        generators[a.test.remote()] = a

    # Generation process
    for _ in tqdm(range(generation_config['max_eps'])):
        ready_ids, _ = ray.wait(list(generators))
        first_id = ready_ids[0]
        first = generators.pop(first_id)
        generators[first.test.remote()] = first
    ray.timeline()
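
The generation loop above keeps exactly one in-flight task per actor: whenever a task finishes it is popped from the dictionary and the same actor is immediately given a new one. A stripped-down, hedged sketch of that pattern with stand-in names:

import ray

ray.init()

@ray.remote
class Generator:
    def test(self):
        # Stand-in for generating one episode.
        return 1

actors = [Generator.remote() for _ in range(4)]
max_eps = 10

in_flight = {a.test.remote(): a for a in actors}
for _ in range(max_eps):
    [done], _ = ray.wait(list(in_flight))
    actor = in_flight.pop(done)
    episode = ray.get(done)
    # Resubmit on the actor that just finished.
    in_flight[actor.test.remote()] = actor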
Ejemplo n.º 42
0
 def h(i):
     # Each instance of h submits and blocks on the result of another
     # remote task using ray.wait.
     object_refs = [f.remote(i, j) for j in range(2)]
     return ray.wait(object_refs, num_returns=len(object_refs))
Ejemplo n.º 43
0
 def step_run():
     ray.wait([signal_actor.send.remote()])
     with FileLock(lock_path):
         return None
Ejemplo n.º 44
0
def test_actor_multiple_gpus(ray_start_cluster):
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 5
    for i in range(num_nodes):
        cluster.add_node(num_cpus=10 * num_gpus_per_raylet,
                         num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    @ray.remote(num_gpus=2)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # Create some actors.
    actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]
    # Make sure that no two actors are assigned to the same GPU.
    locations_and_ids = ray.get(
        [actor.get_location_and_ids.remote() for actor in actors1])
    node_names = {location for location, gpu_id in locations_and_ids}
    assert len(node_names) == num_nodes

    # Keep track of which GPU IDs are being used for each location.
    gpus_in_use = {node_name: [] for node_name in node_names}
    for location, gpu_ids in locations_and_ids:
        gpus_in_use[location].extend(gpu_ids)
    for node_name in node_names:
        assert len(set(gpus_in_use[node_name])) == 4

    # Creating a new actor should fail because all of the GPUs are being
    # used.
    a = Actor1.remote()
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []

    # We should be able to create more actors that use only a single GPU.
    @ray.remote(num_gpus=1)
    class Actor2:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # Create some actors.
    actors2 = [Actor2.remote() for _ in range(num_nodes)]
    # Make sure that no two actors are assigned to the same GPU.
    locations_and_ids = ray.get(
        [actor.get_location_and_ids.remote() for actor in actors2])
    names = {location for location, gpu_id in locations_and_ids}
    assert node_names == names
    for location, gpu_ids in locations_and_ids:
        gpus_in_use[location].extend(gpu_ids)
    for node_name in node_names:
        assert len(gpus_in_use[node_name]) == 5
        assert set(gpus_in_use[node_name]) == set(range(5))

    # Creating a new actor should fail because all of the GPUs are being
    # used.
    a = Actor2.remote()
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []
Ejemplo n.º 46
0
def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
    cluster = ray_start_cluster
    num_nodes = 5
    num_gpus_per_raylet = 5
    for i in range(num_nodes):
        cluster.add_node(num_cpus=10 * num_gpus_per_raylet,
                         num_gpus=num_gpus_per_raylet,
                         _internal_config=json.dumps(
                             {"num_heartbeats_timeout": 1000}))
    ray.init(address=cluster.address)

    @ray.remote
    def create_actors(i, n):
        @ray.remote(num_gpus=1)
        class Actor:
            def __init__(self, i, j):
                self.gpu_ids = ray.get_gpu_ids()

            def get_location_and_ids(self):
                return ((ray.worker.global_worker.node.unique_id),
                        tuple(self.gpu_ids))

            def sleep(self):
                time.sleep(100)

        # Create n actors.
        actors = []
        for j in range(n):
            actors.append(Actor.remote(i, j))

        locations = ray.get(
            [actor.get_location_and_ids.remote() for actor in actors])

        # Put each actor to sleep for a long time to prevent them from getting
        # terminated.
        for actor in actors:
            actor.sleep.remote()

        return locations

    all_locations = ray.get([
        create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes)
    ])

    # Make sure that no two actors are assigned to the same GPU.
    node_names = {
        location
        for locations in all_locations for location, gpu_id in locations
    }
    assert len(node_names) == num_nodes

    # Keep track of which GPU IDs are being used for each location.
    gpus_in_use = {node_name: [] for node_name in node_names}
    for locations in all_locations:
        for location, gpu_ids in locations:
            gpus_in_use[location].extend(gpu_ids)
    for node_name in node_names:
        assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet

    @ray.remote(num_gpus=1)
    class Actor:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # All the GPUs should be used up now.
    a = Actor.remote()
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []
Ejemplo n.º 48
0
def test_actors_and_tasks_with_gpus(ray_start_cluster):
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 2
    for i in range(num_nodes):
        cluster.add_node(num_cpus=num_gpus_per_raylet,
                         num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    def check_intervals_non_overlapping(list_of_intervals):
        for i in range(len(list_of_intervals)):
            for j in range(i):
                first_interval = list_of_intervals[i]
                second_interval = list_of_intervals[j]
                # Check that list_of_intervals[i] and list_of_intervals[j]
                # don't overlap.
                assert first_interval[0] < first_interval[1]
                assert second_interval[0] < second_interval[1]
                intervals_nonoverlapping = (
                    first_interval[1] <= second_interval[0]
                    or second_interval[1] <= first_interval[0])
                assert intervals_nonoverlapping, (
                    "Intervals {} and {} are overlapping.".format(
                        first_interval, second_interval))

    @ray.remote(num_gpus=1)
    def f1():
        t1 = time.monotonic()
        time.sleep(0.1)
        t2 = time.monotonic()
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == 1
        assert gpu_ids[0] in range(num_gpus_per_raylet)
        return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
                [t1, t2])

    @ray.remote(num_gpus=2)
    def f2():
        t1 = time.monotonic()
        time.sleep(0.1)
        t2 = time.monotonic()
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == 2
        assert gpu_ids[0] in range(num_gpus_per_raylet)
        assert gpu_ids[1] in range(num_gpus_per_raylet)
        return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
                [t1, t2])

    @ray.remote(num_gpus=1)
    class Actor1:
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()
            assert len(self.gpu_ids) == 1
            assert self.gpu_ids[0] in range(num_gpus_per_raylet)

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    def locations_to_intervals_for_many_tasks():
        # Launch a bunch of GPU tasks.
        locations_ids_and_intervals = ray.get(
            [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
            [f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
            [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)])

        locations_to_intervals = collections.defaultdict(lambda: [])
        for location, gpu_ids, interval in locations_ids_and_intervals:
            for gpu_id in gpu_ids:
                locations_to_intervals[(location, gpu_id)].append(interval)
        return locations_to_intervals

    # Run a bunch of GPU tasks.
    locations_to_intervals = locations_to_intervals_for_many_tasks()
    # For each GPU, verify that the set of tasks that used this specific
    # GPU did not overlap in time.
    for locations in locations_to_intervals:
        check_intervals_non_overlapping(locations_to_intervals[locations])

    # Create an actor that uses a GPU.
    a = Actor1.remote()
    actor_location = ray.get(a.get_location_and_ids.remote())
    actor_location = (actor_location[0], actor_location[1][0])
    # This check makes sure that actor_location is formatted the same way
    # that the keys of locations_to_intervals are formatted.
    assert actor_location in locations_to_intervals

    # Run a bunch of GPU tasks.
    locations_to_intervals = locations_to_intervals_for_many_tasks()
    # For each GPU, verify that the set of tasks that used this specific
    # GPU did not overlap in time.
    for locations in locations_to_intervals:
        check_intervals_non_overlapping(locations_to_intervals[locations])
    # Make sure that the actor's GPU was not used.
    assert actor_location not in locations_to_intervals

    # Create more actors to fill up all the GPUs.
    more_actors = [
        Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1)
    ]
    # Wait for the actors to finish being created.
    ray.get([actor.get_location_and_ids.remote() for actor in more_actors])

    # Now if we run some GPU tasks, they should not be scheduled.
    results = [f1.remote() for _ in range(30)]
    ready_ids, remaining_ids = ray.wait(results, timeout=1.0)
    assert len(ready_ids) == 0
Ejemplo n.º 49
0
def test_wait(ray_start_regular):
    @ray.remote
    def f(delay):
        time.sleep(delay)
        return

    object_ids = [f.remote(0), f.remote(0), f.remote(0), f.remote(0)]
    ready_ids, remaining_ids = ray.wait(object_ids)
    assert len(ready_ids) == 1
    assert len(remaining_ids) == 3
    ready_ids, remaining_ids = ray.wait(object_ids, num_returns=4)
    assert set(ready_ids) == set(object_ids)
    assert remaining_ids == []

    object_ids = [f.remote(0), f.remote(5)]
    ready_ids, remaining_ids = ray.wait(object_ids, timeout=0.5, num_returns=2)
    assert len(ready_ids) == 1
    assert len(remaining_ids) == 1

    # Verify that calling wait with duplicate object IDs throws an
    # exception.
    x = ray.put(1)
    with pytest.raises(Exception):
        ray.wait([x, x])

    # Make sure it is possible to call wait with an empty list.
    ready_ids, remaining_ids = ray.wait([])
    assert ready_ids == []
    assert remaining_ids == []

    # Test semantics of num_returns with no timeout.
    oids = [ray.put(i) for i in range(10)]
    (found, rest) = ray.wait(oids, num_returns=2)
    assert len(found) == 2
    assert len(rest) == 8

    # Verify that incorrect usage raises a TypeError.
    x = ray.put(1)
    with pytest.raises(TypeError):
        ray.wait(x)
    with pytest.raises(TypeError):
        ray.wait(1)
    with pytest.raises(TypeError):
        ray.wait([1])
Ejemplo n.º 50
0
def test_redeploy_single_replica(serve_instance, use_handle):
    # Tests that redeploying a deployment with a single replica waits for the
    # replica to completely shut down before starting a new one.
    client = serve_instance

    name = "test"

    @ray.remote
    def call(block=False):
        if use_handle:
            handle = serve.get_deployment(name).get_handle()
            ret = ray.get(handle.handler.remote(block))
        else:
            ret = requests.get(f"http://*****:*****@serve.deployment(name=name, version="1")
    class V1:
        async def handler(self, block: bool):
            if block:
                signal = ray.get_actor(signal_name)
                await signal.wait.remote()

            return f"1|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler(request.query_params["block"] == "True")

    class V2:
        async def handler(self, *args):
            return f"2|{os.getpid()}"

        async def __call__(self, request):
            return await self.handler()

    V1.deploy()
    ref1 = call.remote(block=False)
    val1, pid1 = ray.get(ref1)
    assert val1 == "1"

    # ref2 will block until the signal is sent.
    ref2 = call.remote(block=True)
    assert len(ray.wait([ref2], timeout=2.1)[0]) == 0

    # Redeploy new version. This should not go through until the old version
    # replica completely stops.
    V2 = V1.options(func_or_class=V2, version="2")
    V2.deploy(_blocking=False)
    with pytest.raises(TimeoutError):
        client._wait_for_deployment_healthy(V2.name, timeout_s=0.1)

    # It may take some time for the handle change to propagate and requests
    # to get sent to the new version. Repeatedly send requests until they
    # start blocking.
    start = time.time()
    new_version_ref = None
    while time.time() - start < 30:
        ready, not_ready = ray.wait([call.remote(block=False)], timeout=5)
        if len(ready) == 1:
            # If the request doesn't block, it must have been the old version.
            val, pid = ray.get(ready[0])
            assert val == "1"
            assert pid == pid1
        elif len(not_ready) == 1:
            # If the request blocks, it must have been the new version.
            new_version_ref = not_ready[0]
            break
    else:
        assert False, "Timed out waiting for new version to be called."

    # Signal the original call to exit.
    ray.get(signal.send.remote())
    val2, pid2 = ray.get(ref2)
    assert val2 == "1"
    assert pid2 == pid1

    # Now the goal and request to the new version should complete.
    client._wait_for_deployment_healthy(V2.name)
    new_version_val, new_version_pid = ray.get(new_version_ref)
    assert new_version_val == "2"
    assert new_version_pid != pid2
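
The test above leans on ray.wait with a timeout to tell a completed request apart from one that is still blocked on the signal actor. A small, self-contained sketch of that check (the remote function is a stand-in for the test's call):

import time

import ray

ray.init()

@ray.remote
def maybe_blocking_call():
    # Stand-in for a request that is blocked on a signal.
    time.sleep(60)

ref = maybe_blocking_call.remote()
ready, not_ready = ray.wait([ref], timeout=2.0)
assert ready == []  # nothing finished within the timeout, so the call is still blocked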
Ejemplo n.º 51
0
def main():
    parser = argparse.ArgumentParser(description='run parallel data collection')
    parser.add_argument('experiment', type=str, help='experiment name')
    parser.add_argument('--nworkers',
                        type=int,
                        help='number of parallel workers',
                        default=1)
    parser.add_argument('--gpu_id',
                        type=int,
                        help='the starting gpu_id',
                        default=0)
    parser.add_argument('--ngpu',
                        type=int,
                        help='the number of gpus to use',
                        default=1)

    parser.add_argument('--nsplit',
                        type=int,
                        help='number of splits',
                        default=-1)
    parser.add_argument('--isplit', type=int, help='split id', default=-1)
    parser.add_argument('--cloud',
                        dest='cloud',
                        action='store_true',
                        default=False)
    parser.add_argument('--benchmark',
                        dest='do_benchmark',
                        action='store_true',
                        default=False)

    parser.add_argument('--iex',
                        type=int,
                        help='if different from -1 use only do example',
                        default=-1)

    args = parser.parse_args()
    hyperparams_file = args.experiment
    gpu_id = args.gpu_id

    n_worker = args.nworkers
    if args.nworkers == 1:
        parallel = False
    else:
        parallel = True
    print('parallel ', bool(parallel))

    loader = importlib.machinery.SourceFileLoader('mod_hyper',
                                                  hyperparams_file)
    spec = importlib.util.spec_from_loader(loader.name, loader)
    mod = importlib.util.module_from_spec(spec)
    loader.exec_module(mod)
    hyperparams = mod.config

    if args.nsplit != -1:
        assert args.isplit >= 0 and args.isplit < args.nsplit, "isplit should be in [0, nsplit-1]"

        n_persplit = max(
            (hyperparams['end_index'] + 1 - hyperparams['start_index']) /
            args.nsplit, 1)
        hyperparams['end_index'] = int((args.isplit + 1) * n_persplit +
                                       hyperparams['start_index'] - 1)
        hyperparams['start_index'] = int(args.isplit * n_persplit +
                                         hyperparams['start_index'])

    n_traj = hyperparams['end_index'] - hyperparams['start_index'] + 1
    traj_per_worker = int(n_traj // np.float32(n_worker))
    start_idx = [
        hyperparams['start_index'] + traj_per_worker * i
        for i in range(n_worker)
    ]
    end_idx = [
        hyperparams['start_index'] + traj_per_worker * (i + 1) - 1
        for i in range(n_worker)
    ]

    if 'gen_xml' in hyperparams['agent']:  #remove old auto-generated xml files
        try:
            os.system("rm {}".format('/'.join(
                str.split(hyperparams['agent']['filename'], '/')[:-1]) +
                                     '/auto_gen/*'))
        except:
            pass

    if args.do_benchmark:
        use_worker = bench_worker
    else:
        use_worker = worker

    if 'RESULT_DIR' in os.environ:
        if 'exp_name' in hyperparams:
            exp_name = hyperparams['exp_name']
        elif 'record' in hyperparams['agent']:
            exp_name = [
                f for f in hyperparams['agent']['record'].split('/')
                if f != 'record' and len(f) > 0
            ][-1]
        elif 'data_save_dir' in hyperparams['agent']:
            exp_name = hyperparams['agent']['data_save_dir'].split('/')[-1]
        else:
            raise NotImplementedError("can't find exp name")
        result_dir = '{}/{}'.format(os.environ['RESULT_DIR'], exp_name)

        if 'verbose' in hyperparams['policy'] and not os.path.exists(
                result_dir + '/verbose'):
            os.makedirs(result_dir + '/verbose')

        if 'data_save_dir' in hyperparams['agent']:
            hyperparams['agent']['data_save_dir'] = result_dir

    elif 'EXPERIMENT_DIR' in os.environ:
        subpath = hyperparams['current_dir'].partition('experiments')[2]
        result_dir = os.path.join(os.environ['EXPERIMENT_DIR'] + subpath)
    elif args.cloud:
        check_and_pop(hyperparams, 'save_raw_images')
        check_and_pop(hyperparams['agent'], 'make_final_gif')
        check_and_pop(hyperparams['agent'], 'make_final_gif_pointoverlay')
        hyperparams['agent'][
            'data_save_dir'] = '/result/'  # by default save data to the /result folder in the docker image
    else:
        result_dir = hyperparams['current_dir'] + '/verbose'

    if 'master_datadir' in hyperparams['agent']:
        ray.init()
        sync_todo_id = sync.remote(hyperparams['agent'])
        print('launched sync')

    if 'data_save_dir' in hyperparams['agent']:
        record_queue, record_saver_proc, counter = prepare_saver(hyperparams)

    if args.iex != -1:
        hyperparams['agent']['iex'] = args.iex

    conflist = []
    for i in range(n_worker):
        modconf = copy.deepcopy(hyperparams)
        modconf['start_index'] = start_idx[i]
        modconf['end_index'] = end_idx[i]
        modconf['ntraj'] = n_traj
        modconf['gpu_id'] = i + gpu_id
        modconf['result_dir'] = result_dir
        if 'data_save_dir' in hyperparams['agent']:
            modconf['record_saver'] = record_queue
            modconf['counter'] = counter
        conflist.append(modconf)
    if parallel:
        p = Pool(n_worker)
        p.map(use_worker, conflist)
    else:
        use_worker(conflist[0], args.iex, args.ngpu)

    if 'data_save_dir' in hyperparams['agent'] and not hyperparams.get(
            'save_raw_images', False):
        record_queue.put(
            None
        )  # send flag to background thread that it can end saving after it's done
        record_saver_proc.join()  # joins thread and continues execution

    if 'master_datadir' in hyperparams['agent']:
        ray.wait([sync_todo_id])

    if args.do_benchmark:
        pdb.set_trace()
        combine_scores(hyperparams, result_dir)
        sys.exit()
Ejemplo n.º 52
0
 def g(l):
     # The argument l should be a list containing one object ID.
     ray.wait([l[0]])
Ejemplo n.º 53
0
 def child_resource_available():
     p = probe.remote()
     ready, _ = ray.wait([p], timeout=1)
     return len(ready) > 0
Ejemplo n.º 54
0
    def poll(self) -> Tuple[MultiEnvDict, MultiEnvDict, MultiEnvDict,
                            MultiEnvDict, MultiEnvDict]:
        if self.actors is None:
            # `self.make_env` already produces Actors: Use it directly.
            if len(self.existing_envs) > 0 and isinstance(
                    self.existing_envs[0], ray.actor.ActorHandle):
                self.make_env_creates_actors = True
                self.actors = []
                while len(self.actors) < self.num_envs:
                    self.actors.append(self.make_env(len(self.actors)))
            # `self.make_env` produces gym.Envs (or children thereof, such
            # as MultiAgentEnv): Need to auto-wrap it here. The problem with
            # this is that custom methods will get lost. If you would like to
            # keep your custom methods in your envs, you should provide the
            # env class directly in your config (w/o tune.register_env()),
            # such that your class will directly be made a @ray.remote
            # (w/o the wrapping via `_Remote[Multi|Single]AgentEnv`).
            else:

                def make_remote_env(i):
                    logger.info("Launching env {} in remote actor".format(i))
                    if self.multiagent:
                        return _RemoteMultiAgentEnv.remote(self.make_env, i)
                    else:
                        return _RemoteSingleAgentEnv.remote(self.make_env, i)

                self.actors = [
                    make_remote_env(i) for i in range(self.num_envs)
                ]

        if self.pending is None:
            self.pending = {a.reset.remote(): a for a in self.actors}

        # each keyed by env_id in [0, num_remote_envs)
        obs, rewards, dones, infos = {}, {}, {}, {}
        ready = []

        # Wait for at least 1 env to be ready here
        while not ready:
            ready, _ = ray.wait(
                list(self.pending),
                num_returns=len(self.pending),
                timeout=self.poll_timeout)

        # Get and return observations for each of the ready envs
        env_ids = set()
        for obj_ref in ready:
            actor = self.pending.pop(obj_ref)
            env_id = self.actors.index(actor)
            env_ids.add(env_id)
            ret = ray.get(obj_ref)
            # Our sub-envs are simple Actor-turned gym.Envs or MultiAgentEnvs.
            if self.make_env_creates_actors:
                rew, done, info = None, None, None
                if self.multiagent:
                    if isinstance(ret, tuple) and len(ret) == 4:
                        ob, rew, done, info = ret
                    else:
                        ob = ret
                else:
                    if isinstance(ret, tuple) and len(ret) == 4:
                        ob = {_DUMMY_AGENT_ID: ret[0]}
                        rew = {_DUMMY_AGENT_ID: ret[1]}
                        done = {_DUMMY_AGENT_ID: ret[2], "__all__": ret[2]}
                        info = {_DUMMY_AGENT_ID: ret[3]}
                    else:
                        ob = {_DUMMY_AGENT_ID: ret}

                if rew is None:
                    rew = {agent_id: 0 for agent_id in ob.keys()}
                    done = {"__all__": False}
                    info = {agent_id: {} for agent_id in ob.keys()}
            # Our sub-envs are auto-wrapped and already behave like multi-agent
            # envs.
            else:
                ob, rew, done, info = ret
            obs[env_id] = ob
            rewards[env_id] = rew
            dones[env_id] = done
            infos[env_id] = info

        logger.debug("Got obs batch for actors {}".format(env_ids))
        return obs, rewards, dones, infos, {}
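
The poll() method above asks ray.wait for every pending result but bounds the call with a timeout, so it returns as soon as at least one remote env has something ready. A miniature, hedged version of that loop with stand-in env actors:

import ray

ray.init()

@ray.remote
class RemoteEnv:
    def reset(self):
        return {"obs": 0}

actors = [RemoteEnv.remote() for _ in range(3)]
pending = {a.reset.remote(): a for a in actors}
poll_timeout = 0.1

ready = []
while not ready:
    # Request everything that is pending, but give up after poll_timeout
    # seconds; whatever came back gets processed in this poll.
    ready, _ = ray.wait(list(pending), num_returns=len(pending), timeout=poll_timeout)

for obj_ref in ready:
    actor = pending.pop(obj_ref)
    obs = ray.get(obj_ref)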
Ejemplo n.º 55
0
        return ""


serve.init()
serve.create_endpoint("magic_counter", "/counter")
# specify max_batch_size in BackendConfig
backend_config = {"max_batch_size": 5}
serve.create_backend("counter:v1", MagicCounter, 42,
                     config=backend_config)  # increment=42
print("Backend Config for backend: 'counter:v1'")
print(backend_config)
serve.set_traffic("magic_counter", {"counter:v1": 1.0})

handle = serve.get_handle("magic_counter")
future_list = []

# fire 30 requests
for r in range(30):
    print("> [REMOTE] Pinging handle.remote(base_number={})".format(r))
    f = handle.remote(base_number=r)
    future_list.append(f)

# get results of queries as they complete
left_futures = future_list
while left_futures:
    completed_futures, remaining_futures = ray.wait(left_futures, timeout=0.05)
    if len(completed_futures) > 0:
        result = ray.get(completed_futures[0])
        print("< " + result)
    left_futures = remaining_futures
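
A simpler, hedged variant of the draining loop above blocks on one completed future at a time instead of polling with a timeout (the remote function here stands in for the serve handle call):

import ray

ray.init()

@ray.remote
def query(i):
    return "result {}".format(i)

remaining = [query.remote(i) for i in range(30)]
while remaining:
    # Block until at least one result is ready, then consume it.
    [done], remaining = ray.wait(remaining)
    print("< " + ray.get(done))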
Ejemplo n.º 56
0
running_reward = None
# "Xavier" initialization.
# Update buffers that add up gradients over a batch.
grad_buffer = {k: np.zeros_like(v) for k, v in model.weights.items()}
# Update the rmsprop memory.
rmsprop_cache = {k: np.zeros_like(v) for k, v in model.weights.items()}

for i in range(1, 1 + iterations):
    model_id = ray.put(model)
    gradient_ids = []
    # Launch tasks to compute gradients from multiple rollouts in parallel.
    start_time = time.time()
    gradient_ids = [
        actor.compute_gradient.remote(model_id) for actor in actors
    ]
    for batch in range(batch_size):
        [grad_id], gradient_ids = ray.wait(gradient_ids)
        grad, reward_sum = ray.get(grad_id)
        # Accumulate the gradient over batch.
        for k in model.weights:
            grad_buffer[k] += grad[k]
        running_reward = (reward_sum if running_reward is None else
                          running_reward * 0.99 + reward_sum * 0.01)
    end_time = time.time()
    print("Batch {} computed {} rollouts in {} seconds, "
          "running mean is {}".format(i, batch_size, end_time - start_time,
                                      running_reward))
    model.update(grad_buffer, rmsprop_cache, learning_rate, decay_rate)
    zero_grads(grad_buffer)
Ejemplo n.º 57
0
def main(config, list_of_files, frame_sampling, save_frames, start_time,
         end_time):
    Linda_interface = "http://172.17.26.95:8086/status/linda_orange_material_info?"
    nodes = list(set(get_ray_nodes()))
    print(nodes)

    config = resolve_config(config_path=config,
                            frame_sampling=frame_sampling,
                            save_frames=save_frames)

    if LOCAL_TEST:
        local_test(config)
        os.popen('python /project/generate_matches.py')
        return

    schedule.every(3600).seconds.do(check_matchs, config)
    schedule.every(1200).seconds.do(check_convert, config)

    startTime = start_time
    result_ids = []
    prepare_to_end = False
    video_num_total = 0
    while end_time - startTime > 0:
        cur_time = int(time.time())
        if startTime >= cur_time:
            startTime = start_time
            time.sleep(5)
        linda_request_url = Linda_interface + "starttime=" + str(
            startTime) + "&&" + "endtime=" + str((startTime + 600))
        startTime += 600
        if startTime >= end_time:
            prepare_to_end = True
        print(linda_request_url)
        print("cur_time: " + str(cur_time))
        r = requests.get(linda_request_url)
        rsp = json.loads(r.text)
        if rsp['result'] == "success" and range(len(rsp['data']) > 0):
            linda_list = [
                rsp['data'][i]['file_path'] for i in range(len(rsp['data']))
            ]
            linda_list_temp = linda_list.copy()
            video_num_total += len(linda_list_temp)
            record_video_list(linda_request_url, linda_list_temp)

            for idx, value in enumerate(linda_list_temp):
                link = value
                is_in_db = is_video_exist_in_db(config, link.split('/')[-1])
                if not is_in_db:
                    while True:
                        try:
                            if int(ray.available_resources().get("CPU",
                                                                 0)) > 1:
                                if prepare_to_end:
                                    task_id = extract_features.remote(
                                        config, link)
                                    result_ids.append(task_id)
                                else:
                                    extract_features.remote(config, link)

                                break
                        except Exception as e:
                            print(e)
                        schedule.run_pending()
                        time.sleep(2)

            print("Total: " + str(video_num_total) + ", result_ids: " +
                  str(len(result_ids)))

    print("task dis done!")

    count = 0
    while len(result_ids) and count < 100:
        _, result_ids = ray.wait(result_ids)
        print("result_ids:" + str(len(result_ids)))
        count += 1
        time.sleep(2)

    check_convert(config)
    ray.get(
        find_matchs.options(num_cpus=0, resources={
            f"node:{head_ip}": 0.01
        }).remote(config))

    print("All task Done!!")
Ejemplo n.º 58
0
def test_submit_api(shutdown_only):
    ray.init(num_cpus=2, num_gpus=1, resources={"Custom": 1})

    @ray.remote
    def f(n):
        return list(range(n))

    @ray.remote
    def g():
        return ray.get_gpu_ids()

    assert f._remote([0], num_returns=0) is None
    id1 = f._remote(args=[1], num_returns=1)
    assert ray.get(id1) == [0]
    id1, id2 = f._remote(args=[2], num_returns=2)
    assert ray.get([id1, id2]) == [0, 1]
    id1, id2, id3 = f._remote(args=[3], num_returns=3)
    assert ray.get([id1, id2, id3]) == [0, 1, 2]
    assert ray.get(
        g._remote(args=[], num_cpus=1, num_gpus=1, resources={"Custom":
                                                              1})) == [0]
    infeasible_id = g._remote(args=[], resources={"NonexistentCustom": 1})
    assert ray.get(g._remote()) == []
    ready_ids, remaining_ids = ray.wait([infeasible_id], timeout=0.05)
    assert len(ready_ids) == 0
    assert len(remaining_ids) == 1

    # Check mismatch with num_returns.
    with pytest.raises(ValueError):
        ray.get(f.options(num_returns=2).remote(3))
    with pytest.raises(ValueError):
        ray.get(f.options(num_returns=3).remote(2))

    @ray.remote
    class Actor:
        def __init__(self, x, y=0):
            self.x = x
            self.y = y

        def method(self, a, b=0):
            return self.x, self.y, a, b

        def gpu_ids(self):
            return ray.get_gpu_ids()

    @ray.remote
    class Actor2:
        def __init__(self):
            pass

        def method(self):
            pass

    a = Actor._remote(args=[0],
                      kwargs={"y": 1},
                      num_gpus=1,
                      resources={"Custom": 1})

    a2 = Actor2._remote()
    ray.get(a2.method._remote())

    id1, id2, id3, id4 = a.method._remote(args=["test"],
                                          kwargs={"b": 2},
                                          num_returns=4)
    assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2]
Ejemplo n.º 59
0
 def block_in_wait(object_ref_in_list):
     ray.wait(object_ref_in_list)
Ejemplo n.º 60
0
 def block_in_wait(object_id_in_list):
     ray.wait(object_id_in_list)