def setup(self, config: PartialTrainerConfigDict):
    super().setup(config)

    # Shortcut: If execution_plan is used, thread and buffer will be
    # created in there.
    if self.config["_disable_execution_plan_api"] is False:
        return

    # Tag those workers (top 1/3rd indices) that we should collect
    # episodes from for metrics due to `PerWorkerEpsilonGreedy`
    # exploration strategy.
    if self.workers.remote_workers():
        self._remote_workers_for_metrics = self.workers.remote_workers()[
            -len(self.workers.remote_workers()) // 3:]

    num_replay_buffer_shards = self.config["optimizer"][
        "num_replay_buffer_shards"]

    # Create a copy here so that we can modify it without breaking
    # other logic.
    replay_actor_config = copy.deepcopy(self.config["replay_buffer_config"])
    replay_actor_config["capacity"] = (
        self.config["replay_buffer_config"]["capacity"] //
        num_replay_buffer_shards)

    ReplayActor = ray.remote(num_cpus=0)(replay_actor_config["type"])

    # Place all replay buffer shards on the same node as the learner
    # (driver process that runs this execution plan).
    if replay_actor_config["replay_buffer_shards_colocated_with_driver"]:
        self.replay_actors = create_colocated_actors(
            actor_specs=[
                # (class, args, kwargs={}, count)
                (
                    ReplayActor,
                    None,
                    replay_actor_config,
                    num_replay_buffer_shards,
                )
            ],
            node=platform.node(),  # localhost
        )[0]  # [0]=only one item in `actor_specs`.
    # Place replay buffer shards on any node(s).
    else:
        self.replay_actors = [
            # Fixed: Construct with kwargs (`*` would only unpack the
            # config dict's keys as positional args).
            ReplayActor.remote(**replay_actor_config)
            for _ in range(num_replay_buffer_shards)
        ]

    # Create and start the learner thread.
    self.learner_thread = LearnerThread(self.workers.local_worker())
    self.learner_thread.start()

    self.steps_since_update = defaultdict(int)
    weights = self.workers.local_worker().get_weights()
    self.curr_learner_weights = ray.put(weights)
    self.remote_sampling_requests_in_flight: DefaultDict[
        ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)
    self.remote_replay_requests_in_flight: DefaultDict[
        ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)
    self.curr_num_samples_collected = 0
    self.replay_sample_batches = []
    self._num_ts_trained_since_last_target_update = 0
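
# Hedged usage sketch (not part of the original module): demonstrates the
# (class, args, kwargs, count) `actor_specs` format consumed by
# `create_colocated_actors` above, and the per-spec grouping of its return
# value. `EchoActor` and this helper are hypothetical, for illustration only.
def _demo_create_colocated_actors_sketch():
    import platform

    import ray

    @ray.remote(num_cpus=0)
    class EchoActor:
        def __init__(self, capacity: int):
            self.capacity = capacity

        def get_capacity(self) -> int:
            return self.capacity

    # One spec -> one group of two colocated EchoActors, forced onto the
    # local node (same convention as the replay shards above).
    groups = create_colocated_actors(
        actor_specs=[(EchoActor, None, {"capacity": 50}, 2)],
        node=platform.node(),
    )
    assert ray.get(groups[0][0].get_capacity.remote()) == 50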
def setup(self, config: PartialTrainerConfigDict):
    super().setup(config)

    self.remote_sampling_requests_in_flight: DefaultDict[
        ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)

    if self.config["_disable_execution_plan_api"]:
        self.batches_to_place_on_learner = []
        self.batch_being_built = []

        # Create extra aggregation workers and assign each rollout worker
        # to one of them.
        if self.config["num_aggregation_workers"] > 0:
            # This spawns `num_aggregation_workers` actors that aggregate
            # experiences coming from RolloutWorkers in parallel. We force
            # colocation on the same node (localhost) to maximize data
            # bandwidth between them and the learner.
            localhost = platform.node()
            assert localhost != "", (
                "ERROR: Cannot determine local node name! "
                "`platform.node()` returned empty string.")
            all_co_located = create_colocated_actors(
                actor_specs=[
                    # (class, args, kwargs={}, count)
                    (
                        AggregatorWorker,
                        [self.config],
                        {},
                        self.config["num_aggregation_workers"],
                    )
                ],
                node=localhost,
            )
            self.aggregator_workers = [
                actor for actor_groups in all_co_located
                for actor in actor_groups
            ]
            self.remote_aggregator_requests_in_flight: DefaultDict[
                ActorHandle, Set[ray.ObjectRef]] = defaultdict(set)
        # No aggregation workers: Create a local mixin buffer instead.
        else:
            self.local_mixin_buffer = MixInMultiAgentReplayBuffer(
                capacity=(self.config["replay_buffer_num_slots"]
                          if self.config["replay_buffer_num_slots"] > 0
                          else 1),
                replay_ratio=self.config["replay_ratio"],
            )

        # Create and start the learner thread.
        self._learner_thread = make_learner_thread(
            self.workers.local_worker(), self.config)
        self._learner_thread.start()

        self.workers_that_need_updates = set()
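
# Hedged note (assumption based on `MixInMultiAgentReplayBuffer` semantics):
# `replay_ratio` above is the fraction of replayed (old) samples in each
# returned batch, so every fresh batch gets reused about 1 / (1 - ratio)
# times in expectation; e.g. replay_ratio=0.5 is a 1:1 mix of new and old
# data. This helper is hypothetical, for illustration only.
def _expected_sample_reuse_sketch(replay_ratio: float) -> float:
    assert 0.0 <= replay_ratio < 1.0
    return 1.0 / (1.0 - replay_ratio)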
def gather_experiences_tree_aggregation(workers: WorkerSet,
                                        config: Dict) -> "LocalIterator[Any]":
    """Tree aggregation version of gather_experiences_directly()."""
    rollouts = ParallelRollouts(workers, mode="raw")

    # Divide up the rollout workers round-robin between the aggregators.
    worker_assignments = [
        [] for _ in range(config["num_aggregation_workers"])
    ]
    i = 0
    for worker_idx in range(len(workers.remote_workers())):
        worker_assignments[i].append(worker_idx)
        i += 1
        i %= len(worker_assignments)
    logger.info("Worker assignments: {}".format(worker_assignments))

    # Create parallel iterators that represent each aggregation group.
    rollout_groups: List["ParallelIterator[SampleBatchType]"] = [
        rollouts.select_shards(assigned) for assigned in worker_assignments
    ]

    # This spawns |num_aggregation_workers| intermediate actors that
    # aggregate experiences in parallel. We force colocation on the same
    # node (localhost) to maximize data bandwidth between them and the
    # driver.
    localhost = platform.node()
    assert localhost != "", ("ERROR: Cannot determine local node name! "
                             "`platform.node()` returned empty string.")
    all_co_located = create_colocated_actors(
        actor_specs=[
            # (class, args, kwargs={}, count=1)
            (Aggregator, [config, g], {}, 1) for g in rollout_groups
        ],
        node=localhost,
    )

    # Use the first ([0]) actor of each created group (each group only has
    # one actor: count=1).
    train_batches = from_actors([group[0] for group in all_co_located])

    # TODO(ekl) properly account for replay.
    def record_steps_sampled(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        if isinstance(batch, MultiAgentBatch):
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += \
                batch.agent_steps()
        else:
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    return train_batches.gather_async().for_each(record_steps_sampled)
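
# Hedged sketch: the round-robin worker-to-aggregator assignment above,
# isolated for illustration (hypothetical helper, not in the original
# module). With 7 rollout workers and 2 aggregators this yields
# [[0, 2, 4, 6], [1, 3, 5]].
def _round_robin_assign_sketch(num_workers: int, num_groups: int):
    groups = [[] for _ in range(num_groups)]
    for worker_idx in range(num_workers):
        groups[worker_idx % num_groups].append(worker_idx)
    return groups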
def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
    # Merge the policy's config overrides with the main config.
    # Also, adjust `num_gpus` (to indicate an individual policy's
    # num_gpus, not the total number of GPUs).
    cfg = Algorithm.merge_trainer_configs(
        self.config,
        dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
    )

    # Need to create the replay actor first. Then add the first policy.
    if self.replay_actor is None:
        return self._add_replay_buffer_and_policy(policy_id, policy_spec,
                                                  cfg)

    # Replay actor already exists -> Just add a new policy here.
    assert len(self.policy_actors) < self.max_num_policies

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, cfg)

    colocated = create_colocated_actors(
        actor_specs=[(
            ray.remote(
                num_cpus=1,
                num_gpus=self.num_gpus_per_policy
                if not cfg["_fake_gpus"] else 0,
            )(actual_policy_class),
            # Policy c'tor args.
            (policy_spec.observation_space, policy_spec.action_space, cfg),
            # Policy c'tor kwargs={}.
            {},
            # Count=1.
            1,
        )],
        # Force co-location on the already existing replay actor's node.
        node=ray.get(self.replay_actor.get_host.remote()),
    )

    self.policy_actors[policy_id] = colocated[0][0]

    return self.policy_actors[policy_id]
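
# Hedged assumption: the `get_host()` method used above for co-location is
# expected to simply report the node name the replay actor runs on, along
# the lines of:
#
#     def get_host(self) -> str:
#         return platform.node()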
def _add_replay_buffer_and_policy(
    self,
    policy_id: PolicyID,
    policy_spec: PolicySpec,
    config: AlgorithmConfigDict,
):
    assert self.replay_actor is None
    assert len(self.policy_actors) == 0

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, config)

    colocated = create_colocated_actors(
        actor_specs=[
            (self.replay_actor_class, self.replay_actor_args, {}, 1),
        ] + [(
            ray.remote(
                num_cpus=1,
                num_gpus=self.num_gpus_per_policy
                if not config["_fake_gpus"] else 0,
            )(actual_policy_class),
            # Policy c'tor args.
            (policy_spec.observation_space, policy_spec.action_space,
             config),
            # Policy c'tor kwargs={}.
            {},
            # Count=1.
            1,
        )],
        node=None,  # None -> no node constraint (place on any node).
    )

    self.replay_actor = colocated[0][0]
    self.policy_actors[policy_id] = colocated[1][0]
    self.has_replay_buffer = True

    return self.policy_actors[policy_id]
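
# Hedged note on the return shape of `create_colocated_actors` as used in
# the two methods above: the result holds one list of actor handles per
# entry in `actor_specs`, e.g. for the call in
# `_add_replay_buffer_and_policy` (both specs have count=1):
#
#     colocated == [[<replay actor>], [<policy actor>]]
#     colocated[0][0]  # the single replay actor (spec 0)
#     colocated[1][0]  # the single policy actor (spec 1)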
def execution_plan(workers: WorkerSet, config: dict,
                   **kwargs) -> LocalIterator[dict]:
    assert len(kwargs) == 0, (
        "Apex execution_plan does NOT take any additional parameters")

    # Create a number of replay buffer actors.
    num_replay_buffer_shards = config["optimizer"][
        "num_replay_buffer_shards"]
    buffer_size = (config["replay_buffer_config"]["capacity"] //
                   num_replay_buffer_shards)
    replay_actor_args = [
        num_replay_buffer_shards,
        config["learning_starts"],
        buffer_size,
        config["train_batch_size"],
        config["replay_buffer_config"]["prioritized_replay_alpha"],
        config["replay_buffer_config"]["prioritized_replay_beta"],
        config["replay_buffer_config"]["prioritized_replay_eps"],
        config["multiagent"]["replay_mode"],
        config["replay_buffer_config"].get("replay_sequence_length", 1),
    ]

    # Place all replay buffer shards on the same node as the learner
    # (driver process that runs this execution plan).
    if config["replay_buffer_shards_colocated_with_driver"]:
        replay_actors = create_colocated_actors(
            actor_specs=[
                # (class, args, kwargs={}, count)
                (ReplayActor, replay_actor_args, {},
                 num_replay_buffer_shards)
            ],
            node=platform.node(),  # localhost
        )[0]  # [0]=only one item in `actor_specs`.
    # Place replay buffer shards on any node(s).
    else:
        replay_actors = [
            # Fixed: Must call `.remote()` on the actor class (a plain
            # call would instantiate it locally in the driver).
            ReplayActor.remote(*replay_actor_args)
            for _ in range(num_replay_buffer_shards)
        ]

    # Start the learner thread.
    learner_thread = LearnerThread(workers.local_worker())
    learner_thread.start()

    # Update experience priorities post learning.
    def update_prio_and_stats(
            item: Tuple[ActorHandle, dict, int, int]) -> None:
        actor, prio_dict, env_count, agent_count = item
        if config.get("prioritized_replay"):
            actor.update_priorities.remote(prio_dict)
        metrics = _get_shared_metrics()
        # Manually update the steps trained counter since the learner
        # thread is executing outside the pipeline.
        metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = env_count
        metrics.counters[STEPS_TRAINED_COUNTER] += env_count
        metrics.timers["learner_dequeue"] = learner_thread.queue_timer
        metrics.timers["learner_grad"] = learner_thread.grad_timer
        metrics.timers["learner_overall"] = learner_thread.overall_timer

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in one of our replay buffer
    # actors. Update the weights of the worker that generated the batch.
    rollouts = ParallelRollouts(workers, mode="async", num_async=2)
    store_op = rollouts.for_each(StoreToReplayBuffer(actors=replay_actors))
    # Only need to update workers if there are remote workers.
    if workers.remote_workers():
        store_op = store_op.zip_with_source_actor().for_each(
            UpdateWorkerWeights(
                learner_thread,
                workers,
                max_weight_sync_delay=(
                    config["optimizer"]["max_weight_sync_delay"]),
            ))

    # (2) Read experiences from one of the replay buffer actors and send
    # them to the learner thread via its in-queue.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = (Replay(actors=replay_actors, num_async=4)
                 .for_each(lambda x: post_fn(x, workers, config))
                 .zip_with_source_actor()
                 .for_each(Enqueue(learner_thread.inqueue)))

    # (3) Get priorities back from the learner thread and apply them to
    # the replay buffer actors.
    update_op = (Dequeue(learner_thread.outqueue,
                         check=learner_thread.is_alive)
                 .for_each(update_prio_and_stats)
                 .for_each(UpdateTargetNetwork(
                     workers,
                     config["target_network_update_freq"],
                     by_steps_trained=True)))

    if config["training_intensity"]:
        # Execute (1), (2) with a fixed intensity ratio.
        rr_weights = calculate_rr_weights(config) + ["*"]
        merged_op = Concurrently(
            [store_op, replay_op, update_op],
            mode="round_robin",
            output_indexes=[2],
            round_robin_weights=rr_weights,
        )
    else:
        # Execute (1), (2), (3) asynchronously as fast as possible. Only
        # output items from (3) since metrics aren't available before
        # then.
        merged_op = Concurrently([store_op, replay_op, update_op],
                                 mode="async",
                                 output_indexes=[2])

    # Add in extra replay and learner metrics to the training result.
    def add_apex_metrics(result: dict) -> dict:
        replay_stats = ray.get(replay_actors[0].stats.remote(
            config["optimizer"].get("debug")))
        exploration_infos = workers.foreach_policy_to_train(
            lambda p, _: p.get_exploration_state())
        result["info"].update({
            "exploration_infos": exploration_infos,
            "learner_queue": learner_thread.learner_queue_size.stats(),
            LEARNER_INFO: copy.deepcopy(learner_thread.learner_info),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of
    # epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)
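
# Hedged sketch (assumption, mirroring the DQN-style definition of
# `calculate_rr_weights`): `training_intensity` is the desired ratio of
# trained to sampled timesteps. The replay op is weighted relative to the
# store op so that ratio is met, and the appended "*" lets the third op
# (priority updates) run unthrottled in the round-robin. This helper is
# hypothetical, for illustration only.
def _rr_weights_sketch(train_batch_size: int, rollout_fragment_length: int,
                       num_envs_per_worker: int, num_workers: int,
                       training_intensity: float):
    # Trained-vs-sampled ratio if store and replay ran at equal weight.
    native_ratio = train_batch_size / (
        rollout_fragment_length * num_envs_per_worker * max(num_workers, 1))
    return [1, training_intensity / native_ratio] + ["*"]

# E.g. batches of 512 trained against 50-step fragments from 32 envs with
# training_intensity=1.0 -> [1, 3.125, "*"].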