def test_high_load(self):
    workers = [
        RemoteRLlibActor.remote(sleep_time=random.random() * 2.0)
        for _ in range(60)
    ]
    manager = AsyncRequestsManager(
        workers,
        max_remote_requests_in_flight_per_worker=2,
        return_object_refs=True,
        ray_wait_timeout_s=0.0,
    )
    num_ready = 0
    for i in range(2000):
        manager.call_on_all_available(lambda w: w.task())
        time.sleep(0.01)
        ready = manager.get_ready()
        for reqs in ready.values():
            num_ready += len(reqs)
            ray.get(reqs)
        # Launch extra tasks outside the manager to add background load on
        # the workers.
        for worker in ready.keys():
            worker.task2.remote(1, 3)
    time.sleep(20)
    ready = manager.get_ready()
    num_ready += sum(len(reqs) for reqs in ready.values())
    actually_called = sum(
        ray.get(
            [worker.apply.remote(lambda w: w.num_task_called) for worker in workers]
        )
    )
    assert actually_called == num_ready, (actually_called, num_ready)
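# The tests in this module exercise a `RemoteRLlibActor` helper that is
# defined elsewhere in the test file. Below is a minimal sketch of what such
# an actor could look like, assuming only the interface the tests use
# (`task`, `task2`, `num_task_called`, `apply`); the real helper may differ.
import time

import ray


@ray.remote
class RemoteRLlibActor:
    def __init__(self, sleep_time: float):
        self.sleep_time = sleep_time
        self.num_task_called = 0
        self.num_task2_called = 0

    def task(self):
        # Simulate a remote workload that takes `sleep_time` seconds.
        time.sleep(self.sleep_time)
        self.num_task_called += 1
        return "done"

    def task2(self, a, b):
        # Same workload, but accepting two arguments; counted separately so
        # that `num_task_called` only reflects `task()` invocations.
        time.sleep(self.sleep_time)
        self.num_task2_called += 1
        return "done"

    def apply(self, fn):
        # Run `fn` on this actor and return its result (mirrors
        # `RolloutWorker.apply`).
        return fn(self)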
def setup(self, config: PartialTrainerConfigDict):
    super().setup(config)

    # Initialize torch process group for distributed training.
    if self.config["_disable_execution_plan_api"] is True:
        self._curr_learner_info = {}
        ip = ray.get(self.workers.remote_workers()[0].get_node_ip.remote())
        port = ray.get(self.workers.remote_workers()[0].find_free_port.remote())
        address = "tcp://{ip}:{port}".format(ip=ip, port=port)
        logger.info("Creating torch process group with leader {}".format(address))

        # Get setup tasks in order to throw errors on failure.
        ray.get(
            [
                worker.setup_torch_data_parallel.remote(
                    url=address,
                    world_rank=i,
                    world_size=len(self.workers.remote_workers()),
                    backend=self.config["torch_distributed_backend"],
                )
                for i, worker in enumerate(self.workers.remote_workers())
            ]
        )
        logger.info("Torch process group init completed")
        self._ddppo_worker_manager = AsyncRequestsManager(
            self.workers.remote_workers(),
            max_remote_requests_in_flight_per_worker=1,
            ray_wait_timeout_s=0.03,
        )
def test_async_requests_task_doesnt_buffer(self):
    """Tests that the async manager drops requests that exceed the
    per-worker in-flight limit instead of buffering them."""
    workers = [RemoteRLlibActor.remote(sleep_time=0.1) for _ in range(2)]
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=2
    )
    for i in range(8):
        scheduled = manager.call(lambda w: w.task())
        if i < 4:
            assert scheduled, "We should have scheduled the task"
        else:
            assert not scheduled, (
                "We should not have scheduled the task because"
                " all workers are busy."
            )
    assert len(manager._pending_remotes) == 4, "We should have 4 pending requests"
    time.sleep(3)
    ready_requests = manager.get_ready()
    for worker in workers:
        if not len(ready_requests[worker]) == 2:
            raise Exception(
                "We should return the 2 ready requests in this case from each "
                "actor."
            )
    # Schedule 4 new tasks now that the workers are free again.
    for _ in range(4):
        manager.call(lambda w: w.task())
    time.sleep(3)
    ready_requests = manager.get_ready()
    for worker in workers:
        if not len(ready_requests[worker]) == 2:
            raise Exception(
                "We should return the 2 ready requests in this case from each "
                "actor."
            )
def test_add_remove_actors(self):
    """Tests that the async manager can properly add and remove actors."""
    workers = []
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=2
    )
    if not (
        len(manager._all_workers)
        == len(manager._remote_requests_in_flight)
        == len(manager._pending_to_actor)
        == len(manager._pending_remotes)
        == 0
    ):
        raise ValueError("We should have no workers in this case.")
    assert not manager.call(lambda w: w.task()), (
        "Task shouldn't have been launched since there are no workers "
        "in the manager."
    )
    worker = RemoteRLlibActor.remote(sleep_time=0.1)
    manager.add_workers(worker)
    manager.call(lambda w: w.task())
    if not (
        len(manager._remote_requests_in_flight[worker])
        == len(manager._pending_to_actor)
        == len(manager._all_workers)
        == len(manager._pending_remotes)
        == 1
    ):
        raise ValueError("We should have 1 worker and 1 pending request.")
    time.sleep(3)
    manager.get_ready()
    # Test worker removal.
    for i in range(2):
        manager.call(lambda w: w.task())
        assert len(manager._pending_remotes) == i + 1
    manager.remove_workers(worker)
    if not len(manager._all_workers) == 0:
        raise ValueError("We should have no workers that we can schedule tasks to.")
    if not (
        len(manager._pending_remotes) == 2 and len(manager._pending_to_actor) == 2
    ):
        raise ValueError(
            "We should still have 2 pending requests in flight from the worker."
        )
    time.sleep(3)
    result = manager.get_ready()
    if not (
        len(result) == 1
        and len(result[worker]) == 2
        and len(manager._pending_remotes) == 0
        and len(manager._pending_to_actor) == 0
    ):
        raise ValueError(
            "We should have 2 ready results from the worker and no pending requests."
        )
def test_call_to_actor(self):
    workers = [RemoteRLlibActor.remote(sleep_time=0.1) for _ in range(2)]
    worker_not_in_manager = RemoteRLlibActor.remote(sleep_time=0.1)
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=2
    )
    manager.call(lambda w: w.task(), actor=workers[0])
    time.sleep(3)
    results = manager.get_ready()
    # Exactly one result, and it must come from the worker we targeted.
    if not (len(results) == 1 and workers[0] in results):
        raise Exception(
            "We should return the 1 ready request in this case from the worker "
            "we called to."
        )
    with pytest.raises(ValueError, match=".*has not been added to the manager.*"):
        manager.call(lambda w: w.task(), actor=worker_not_in_manager)
def test_round_robin_scheduling(self):
    """Tests that the async manager schedules actors in a round-robin fashion."""
    workers = [RemoteRLlibActor.remote(sleep_time=0.1) for _ in range(2)]
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=2
    )
    for i in range(4):
        scheduled_actor = workers[i % len(workers)]
        manager.call(lambda w: w.task())
        if i < 2:
            assert len(manager._remote_requests_in_flight[scheduled_actor]) == 1, (
                "We should have 1 request in flight for the actor that we just "
                "scheduled on."
            )
        else:
            assert len(manager._remote_requests_in_flight[scheduled_actor]) == 2, (
                "We should have 2 requests in flight for the actor that we just "
                "scheduled on."
            )
def test_async_requests_manager_num_returns(self):
    """Tests that an async manager can properly handle actors with tasks that
    vary in the amount of time that they take to run."""
    workers = [RemoteRLlibActor.remote(sleep_time=0.1) for _ in range(2)]
    workers += [RemoteRLlibActor.remote(sleep_time=5) for _ in range(2)]
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=1
    )
    for _ in range(4):
        manager.call(lambda w: w.task())
    time.sleep(3)
    if not len(manager.get_ready()) == 2:
        raise Exception(
            "We should return the 2 ready requests in this case from the actors"
            " that have shorter tasks."
        )
    time.sleep(7)
    if not len(manager.get_ready()) == 2:
        raise Exception(
            "We should return the 2 ready requests in this case from the actors"
            " that have longer tasks."
        )
def test_args_kwargs(self):
    """Tests that the async manager properly passes positional and keyword
    arguments through to the remote task."""
    workers = [RemoteRLlibActor.remote(sleep_time=0.1)]
    manager = AsyncRequestsManager(
        workers, max_remote_requests_in_flight_per_worker=2
    )
    for _ in range(2):
        manager.call(lambda w, a, b: w.task2(a, b), fn_args=[1, 2])
    time.sleep(3)
    if not len(manager.get_ready()[workers[0]]) == 2:
        raise Exception(
            "We should return the 2 ready requests in this case from the worker"
            " that was called with fn_args."
        )
    for _ in range(2):
        manager.call(lambda w, a, b: w.task2(a, b), fn_kwargs=dict(a=1, b=2))
    time.sleep(3)
    if not len(manager.get_ready()[workers[0]]) == 2:
        raise Exception(
            "We should return the 2 ready requests in this case from the worker"
            " that was called with fn_kwargs."
        )
def setup(self, config: PartialAlgorithmConfigDict):
    super().setup(config)

    if self.config["_disable_execution_plan_api"]:
        # Create extra aggregation workers and assign each rollout worker to
        # one of them.
        self.batches_to_place_on_learner = []
        self.batch_being_built = []
        if self.config["num_aggregation_workers"] > 0:
            # This spawns `num_aggregation_workers` actors that aggregate
            # experiences coming from RolloutWorkers in parallel. We force
            # colocation on the same node (localhost) to maximize data bandwidth
            # between them and the learner.
            localhost = platform.node()
            assert localhost != "", (
                "ERROR: Cannot determine local node name! "
                "`platform.node()` returned empty string."
            )
            all_co_located = create_colocated_actors(
                actor_specs=[
                    # (class, args, kwargs={}, count=1)
                    (
                        AggregatorWorker,
                        [
                            self.config,
                        ],
                        {},
                        self.config["num_aggregation_workers"],
                    )
                ],
                node=localhost,
            )
            self._aggregator_workers = [
                actor for actor_groups in all_co_located for actor in actor_groups
            ]
            self._aggregator_actor_manager = AsyncRequestsManager(
                self._aggregator_workers,
                max_remote_requests_in_flight_per_worker=self.config[
                    "max_requests_in_flight_per_aggregator_worker"
                ],
                ray_wait_timeout_s=self.config["timeout_s_aggregator_manager"],
            )
        else:
            # Create our local mixin buffer if the num of aggregation workers is 0.
            self.local_mixin_buffer = MixInMultiAgentReplayBuffer(
                capacity=(
                    self.config["replay_buffer_num_slots"]
                    if self.config["replay_buffer_num_slots"] > 0
                    else 1
                ),
                replay_ratio=self.config["replay_ratio"],
                replay_mode=ReplayMode.LOCKSTEP,
            )

        self._sampling_actor_manager = AsyncRequestsManager(
            self.workers.remote_workers(),
            max_remote_requests_in_flight_per_worker=self.config[
                "max_requests_in_flight_per_sampler_worker"
            ],
            return_object_refs=True,
            ray_wait_timeout_s=self.config["timeout_s_sampler_manager"],
        )

        # Create and start the learner thread.
        self._learner_thread = make_learner_thread(
            self.workers.local_worker(), self.config
        )
        self._learner_thread.start()

        self.workers_that_need_updates = set()
def setup(self, config: PartialAlgorithmConfigDict):
    super().setup(config)

    # Shortcut: If execution_plan, thread and buffer will be created in there.
    if self.config["_disable_execution_plan_api"] is False:
        return

    # Tag those workers (top 1/3rd indices) that we should collect episodes from
    # for metrics due to `PerWorkerEpsilonGreedy` exploration strategy.
    if self.workers.remote_workers():
        self._remote_workers_for_metrics = self.workers.remote_workers()[
            -len(self.workers.remote_workers()) // 3 :
        ]

    num_replay_buffer_shards = self.config["optimizer"]["num_replay_buffer_shards"]

    # Create copy here so that we can modify without breaking other logic.
    replay_actor_config = copy.deepcopy(self.config["replay_buffer_config"])
    replay_actor_config["capacity"] = (
        self.config["replay_buffer_config"]["capacity"] // num_replay_buffer_shards
    )

    ReplayActor = ray.remote(num_cpus=0)(replay_actor_config["type"])

    # Place all replay buffer shards on the same node as the learner
    # (driver process that runs this execution plan).
    if replay_actor_config["replay_buffer_shards_colocated_with_driver"]:
        self._replay_actors = create_colocated_actors(
            actor_specs=[
                # (class, args, kwargs={}, count)
                (
                    ReplayActor,
                    None,
                    replay_actor_config,
                    num_replay_buffer_shards,
                )
            ],
            node=platform.node(),  # localhost
        )[0]  # [0]=only one item in `actor_specs`.
    # Place replay buffer shards on any node(s).
    else:
        self._replay_actors = [
            ReplayActor.remote(**replay_actor_config)
            for _ in range(num_replay_buffer_shards)
        ]

    self._replay_actor_manager = AsyncRequestsManager(
        self._replay_actors,
        max_remote_requests_in_flight_per_worker=self.config[
            "max_requests_in_flight_per_replay_worker"
        ],
        ray_wait_timeout_s=self.config["timeout_s_replay_manager"],
    )
    self._sampling_actor_manager = AsyncRequestsManager(
        self.workers.remote_workers(),
        max_remote_requests_in_flight_per_worker=self.config[
            "max_requests_in_flight_per_sampler_worker"
        ],
        ray_wait_timeout_s=self.config["timeout_s_sampler_manager"],
    )
    self.learner_thread = LearnerThread(self.workers.local_worker())
    self.learner_thread.start()
    self.steps_since_update = defaultdict(int)
    weights = self.workers.local_worker().get_weights()
    self.curr_learner_weights = ray.put(weights)
    self.curr_num_samples_collected = 0
    self.replay_sample_batches = []
    self._num_ts_trained_since_last_target_update = 0
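# For illustration: a sketch of how a manager like `self._replay_actor_manager`
# above can target a specific replay shard via the `actor=` argument. The
# `sample()` call and the surrounding loop are assumptions for the example,
# not the actual APEX training-step code.
def _sketch_request_replay_samples(replay_actor_manager, replay_actors, num_items):
    # Schedule one `sample()` call per shard; the manager silently drops any
    # request that would exceed the per-actor in-flight limit.
    for replay_actor in replay_actors:
        replay_actor_manager.call(
            lambda actor: actor.sample(num_items), actor=replay_actor
        )
    # Collect whatever has finished so far; unfinished requests stay pending.
    return replay_actor_manager.get_ready()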
def setup(self, config: PartialTrainerConfigDict):
    super().setup(config)
    self._worker_manager = AsyncRequestsManager(
        self.workers.remote_workers(),
        max_remote_requests_in_flight_per_worker=1,
    )
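# A sketch of the typical consumption pattern for a manager like
# `self._worker_manager` above, based on the API exercised in the tests
# (`call_on_all_available` / `get_ready`). The `sample()` call and the result
# handling are illustrative assumptions, not the algorithm's actual
# training-step code.
def _sketch_training_step(worker_manager):
    # Kick off one `sample()` request on every worker that still has capacity
    # under its in-flight limit.
    worker_manager.call_on_all_available(lambda w: w.sample())
    # Fetch results of requests that completed within the manager's
    # `ray_wait_timeout_s`; everything else remains in flight.
    sample_batches = []
    for worker, results in worker_manager.get_ready().items():
        sample_batches.extend(results)
    return sample_batches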
def setup(self, config: PartialAlgorithmConfigDict):
    # Call super's setup to validate config, create RolloutWorkers
    # (train and eval), etc.
    num_gpus_saved = config["num_gpus"]
    config["num_gpus"] = min(config["num_gpus"], 1)
    super().setup(config)
    self.config["num_gpus"] = num_gpus_saved

    # - Create n policy learner actors (@ray.remote-converted Policies) on
    #   one or more GPU nodes.
    # - On each such node, also locate one replay buffer shard.

    ma_cfg = self.config["multiagent"]
    # By default, set max_num_policies_to_train to the number of policy IDs
    # provided in the multiagent config.
    if self.config["max_num_policies_to_train"] is None:
        self.config["max_num_policies_to_train"] = len(
            self.workers.local_worker().get_policies_to_train()
        )

    # Single CPU replay shard (co-located with GPUs so we can place the
    # policies on the same machine(s)).
    num_gpus = (
        0.01 if (self.config["num_gpus"] and not self.config["_fake_gpus"]) else 0
    )
    ReplayActor = ray.remote(
        num_cpus=1,
        num_gpus=num_gpus,
    )(MixInMultiAgentReplayBuffer)

    # Setup remote replay buffer shards and policy learner actors
    # (located on any GPU machine in the cluster):
    replay_actor_args = [
        self.config["replay_buffer_capacity"],
        self.config["replay_buffer_replay_ratio"],
    ]

    # Create a DistributedLearners utility object and set it up with
    # the initial first n learnable policies (found in the config).
    distributed_learners = DistributedLearners(
        config=self.config,
        max_num_policies_to_train=self.config["max_num_policies_to_train"],
        replay_actor_class=ReplayActor,
        replay_actor_args=replay_actor_args,
    )
    for pid, policy_spec in ma_cfg["policies"].items():
        if pid in self.workers.local_worker().get_policies_to_train():
            distributed_learners.add_policy(pid, policy_spec)

    # Store distributed_learners on all RolloutWorkers so they know to which
    # replay shard to send samples.
    def _set_policy_learners(worker):
        worker._distributed_learners = distributed_learners

    ray.get(
        [
            w.apply.remote(_set_policy_learners)
            for w in self.workers.remote_workers()
        ]
    )

    self.distributed_learners = distributed_learners
    self._sampling_actor_manager = AsyncRequestsManager(
        self.workers.remote_workers(),
        max_remote_requests_in_flight_per_worker=self.config[
            "max_requests_in_flight_per_sampler_worker"
        ],
        ray_wait_timeout_s=self.config["timeout_s_sampler_manager"],
    )
    policy_actors = [policy_actor for _, policy_actor, _ in distributed_learners]
    self._learner_worker_manager = AsyncRequestsManager(
        workers=policy_actors,
        max_remote_requests_in_flight_per_worker=self.config[
            "max_requests_in_flight_per_learner_worker"
        ],
        ray_wait_timeout_s=self.config["timeout_s_learner_manager"],
    )