def sync_ensemble(workers: WorkerSet) -> None:
    """Syncs dynamics ensemble weights from driver (main) to workers.

    Args:
        workers: Set of workers, including driver (main).
    """

    def get_ensemble_weights(worker):
        policy_map = worker.policy_map
        policies = policy_map.keys()

        def policy_ensemble_weights(policy):
            model = policy.dynamics_model
            return {
                k: v.cpu().detach().numpy()
                for k, v in model.state_dict().items()
            }

        return {
            pid: policy_ensemble_weights(policy)
            for pid, policy in policy_map.items()
            if pid in policies
        }

    def set_ensemble_weights(policy, pid, weights):
        weights = weights[pid]
        weights = convert_to_torch_tensor(weights, device=policy.device)
        model = policy.dynamics_model
        model.load_state_dict(weights)

    if workers.remote_workers():
        weights = ray.put(get_ensemble_weights(workers.local_worker()))
        set_func = ray.put(set_ensemble_weights)
        for e in workers.remote_workers():
            e.foreach_policy.remote(set_func, weights=weights)
def ParallelRollouts(workers: WorkerSet,
                     mode="bulk_sync") -> LocalIterator[SampleBatch]:
    """Operator to collect experiences in parallel from rollout workers.

    If there are no remote workers, experiences will be collected serially
    from the local worker instance instead.

    Arguments:
        workers (WorkerSet): set of rollout workers to use.
        mode (str): One of {'async', 'bulk_sync'}.
            - In 'async' mode, batches are returned as soon as they are
              computed by rollout workers with no order guarantees.
            - In 'bulk_sync' mode, we collect one batch from each worker
              and concatenate them together into a large batch to return.

    Returns:
        A local iterator over experiences collected in parallel.

    Examples:
        >>> rollouts = ParallelRollouts(workers, mode="async")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        50  # config.sample_batch_size

        >>> rollouts = ParallelRollouts(workers, mode="bulk_sync")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        200  # config.sample_batch_size * config.num_workers

    Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context.
    """

    def report_timesteps(batch):
        metrics = LocalIterator.get_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    if not workers.remote_workers():
        # Handle the serial sampling case.
        def sampler(_):
            while True:
                yield workers.local_worker().sample()

        return (LocalIterator(sampler,
                              MetricsContext()).for_each(report_timesteps))

    # Create a parallel iterator over generated experiences.
    rollouts = from_actors(workers.remote_workers())

    if mode == "bulk_sync":
        return rollouts \
            .batch_across_shards() \
            .for_each(lambda batches: SampleBatch.concat_samples(batches)) \
            .for_each(report_timesteps)
    elif mode == "async":
        return rollouts.gather_async().for_each(report_timesteps)
    else:
        raise ValueError(
            "mode must be one of 'bulk_sync', 'async', got '{}'".format(mode))
def synchronous_parallel_sample(workers: WorkerSet) -> List[SampleBatch]:
    # No remote workers in the set -> Use local worker for collecting
    # samples.
    if not workers.remote_workers():
        return [workers.local_worker().sample()]

    # Loop over remote workers' `sample()` method in parallel.
    sample_batches = ray.get(
        [r.sample.remote() for r in workers.remote_workers()])

    return sample_batches
def sync_stats(workers: WorkerSet) -> None:
    """Syncs dynamics model normalization stats from driver (main) to workers."""

    def get_normalizations(worker):
        policy = worker.policy_map[DEFAULT_POLICY_ID]
        return policy.dynamics_model.normalizations

    def set_normalizations(policy, pid, normalizations):
        policy.dynamics_model.set_norms(normalizations)

    if workers.remote_workers():
        normalization_dict = ray.put(
            get_normalizations(workers.local_worker()))
        set_func = ray.put(set_normalizations)
        for e in workers.remote_workers():
            e.foreach_policy.remote(
                set_func, normalizations=normalization_dict)
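# Usage sketch (illustrative, not part of the original source): how the two
# sync helpers above are typically combined, mirroring the MB-MPO execution
# plans further below. `fit_dynamics` is assumed to be the per-policy dynamics
# training function referenced in those plans; `workers` is a WorkerSet.
def sync_dynamics_example(workers: WorkerSet) -> None:
    # Fit the dynamics ensemble on the driver (local worker) only.
    workers.local_worker().foreach_policy(fit_dynamics)
    # Broadcast the current policy weights to all remote workers.
    workers.sync_weights()
    # Broadcast the fitted ensemble weights and their normalization stats.
    sync_ensemble(workers)
    sync_stats(workers)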
def synchronous_parallel_sample(
    worker_set: WorkerSet,
    remote_fn: Optional[Callable[["RolloutWorker"], None]] = None,
) -> List[SampleBatch]:
    """Runs parallel and synchronous rollouts on all remote workers.

    Waits for all workers to return from the remote calls.

    If no remote workers exist (num_workers == 0), use the local worker
    for sampling.

    Alternatively to calling `worker.sample.remote()`, the user can provide a
    `remote_fn()`, which will be applied to the worker(s) instead.

    Args:
        worker_set: The WorkerSet to use for sampling.
        remote_fn: If provided, use `worker.apply.remote(remote_fn)` instead
            of `worker.sample.remote()` to generate the requests.

    Returns:
        The list of collected sample batch types (one for each parallel
        rollout worker in the given `worker_set`).

    Examples:
        >>> # Define an RLlib trainer.
        >>> trainer = ... # doctest: +SKIP
        >>> # 2 remote workers (num_workers=2):
        >>> batches = synchronous_parallel_sample(trainer.workers) # doctest: +SKIP
        >>> print(len(batches)) # doctest: +SKIP
        2
        >>> print(batches[0]) # doctest: +SKIP
        SampleBatch(16: ['obs', 'actions', 'rewards', 'dones'])
        >>> # 0 remote workers (num_workers=0): Using the local worker.
        >>> batches = synchronous_parallel_sample(trainer.workers) # doctest: +SKIP
        >>> print(len(batches)) # doctest: +SKIP
        1
    """
    # No remote workers in the set -> Use local worker for collecting
    # samples.
    if not worker_set.remote_workers():
        return [worker_set.local_worker().sample()]

    # Loop over remote workers' `sample()` method in parallel.
    sample_batches = ray.get(
        [r.sample.remote() for r in worker_set.remote_workers()])

    # Return all collected batches.
    return sample_batches
def gather_experiences_tree_aggregation(workers: WorkerSet,
                                        config: Dict) -> "LocalIterator[Any]":
    """Tree aggregation version of gather_experiences_directly()."""

    rollouts = ParallelRollouts(workers, mode="raw")

    # Divide up the workers between aggregators.
    worker_assignments = [
        [] for _ in range(config["num_aggregation_workers"])
    ]
    i = 0
    for w in range(len(workers.remote_workers())):
        worker_assignments[i].append(w)
        i += 1
        i %= len(worker_assignments)
    logger.info("Worker assignments: {}".format(worker_assignments))

    # Create parallel iterators that represent each aggregation group.
    rollout_groups: List["ParallelIterator[SampleBatchType]"] = [
        rollouts.select_shards(assigned) for assigned in worker_assignments
    ]

    # This spawns |num_aggregation_workers| intermediate actors that aggregate
    # experiences in parallel. We force colocation on the same node to
    # maximize data bandwidth between them and the driver.
    train_batches = from_actors([
        create_colocated(Aggregator, [config, g], 1)[0]
        for g in rollout_groups
    ])

    # TODO(ekl) properly account for replay.
    def record_steps_sampled(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    return train_batches.gather_async().for_each(record_steps_sampled)
def LocalComputeUpdates(workers: WorkerSet, significance_threshold):
    """Trains each remote worker on its own samples and collects its updates.

    Each worker learns on the batch it collected, accumulates gradients via
    `asp_accumulate_grads()`, and returns the updates obtained from
    `asp_get_updates(significance_threshold)` for every trainable policy.
    """
    rollouts = from_actors(workers.remote_workers())

    def train_on_batch(samples):
        if isinstance(samples, SampleBatch):
            samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples},
                                      samples.count)
        worker = get_global_worker()
        if not hasattr(worker, 'num_iterations_trained'):
            worker.num_iterations_trained = 0
        info = worker.learn_on_batch(samples)
        worker.foreach_trainable_policy(
            lambda p, pid: p.asp_accumulate_grads())
        worker.num_iterations_trained += 1
        info['num_iterations_trained'] = worker.num_iterations_trained
        updates = {
            pid: worker.get_policy(pid).asp_get_updates(
                significance_threshold)
            for pid in worker.policies_to_train
        }
        return updates, info, samples.count, 1

    res = rollouts.for_each(train_on_batch)
    return res
def LocalTrainOneStep(workers: WorkerSet,
                      num_sgd_iter: int = 1,
                      sgd_minibatch_size: int = 0):
    """Trains each remote worker locally on the batches it collects.

    Uses minibatch SGD if `num_sgd_iter > 1`, otherwise a single
    `learn_on_batch()` call. Returns an iterator over
    (info, sample count, num_sgd_iter) tuples.
    """
    rollouts = from_actors(workers.remote_workers())

    def train_on_batch(samples):
        if isinstance(samples, SampleBatch):
            samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples},
                                      samples.count)
        worker = get_global_worker()
        if not hasattr(worker, 'num_iterations_trained'):
            worker.num_iterations_trained = 0
        if num_sgd_iter > 1:
            info = do_minibatch_sgd(
                samples, {
                    pid: worker.get_policy(pid)
                    for pid in worker.policies_to_train
                }, worker, num_sgd_iter, sgd_minibatch_size, [])
        else:
            info = worker.learn_on_batch(samples)
        worker.num_iterations_trained += 1
        info['num_iterations_trained'] = worker.num_iterations_trained
        return info, samples.count, num_sgd_iter

    info = rollouts.for_each(train_on_batch)
    return info
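# Usage sketch (assumption, not from the original source): wiring
# `LocalTrainOneStep` into a driver loop. Each remote worker samples and then
# trains on its own batch; the driver only gathers the returned info tuples.
train_op = LocalTrainOneStep(workers, num_sgd_iter=1, sgd_minibatch_size=0)
results = train_op.gather_async()
info, count, sgd_iters = next(results)  # (learner info, env steps, SGD iters)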
def inner_adaptation(workers: WorkerSet, samples: List[SampleBatch]):
    """Performs one gradient descent step on each remote worker.

    Args:
        workers: The WorkerSet of the Algorithm.
        samples (List[SampleBatch]): The list of SampleBatches to perform
            a training step on (one for each remote worker).
    """
    for i, e in enumerate(workers.remote_workers()):
        e.learn_on_batch.remote(samples[i])
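# Usage sketch (assumption, not from the original source): one synchronous
# inner-adaptation round. Each remote worker collects a batch, then takes a
# single local gradient step on exactly that batch, as done inside the
# MAML/MB-MPO `inner_adaptation_steps` generators below.
samples = ray.get([w.sample.remote() for w in workers.remote_workers()])
inner_adaptation(workers, samples)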
def gather_experiences_tree_aggregation(workers: WorkerSet,
                                        config: Dict) -> "LocalIterator[Any]":
    """Tree aggregation version of gather_experiences_directly()."""

    rollouts = ParallelRollouts(workers, mode="raw")

    # Divide up the workers between aggregators.
    worker_assignments = [
        [] for _ in range(config["num_aggregation_workers"])
    ]
    i = 0
    for worker_idx in range(len(workers.remote_workers())):
        worker_assignments[i].append(worker_idx)
        i += 1
        i %= len(worker_assignments)
    logger.info("Worker assignments: {}".format(worker_assignments))

    # Create parallel iterators that represent each aggregation group.
    rollout_groups: List["ParallelIterator[SampleBatchType]"] = [
        rollouts.select_shards(assigned) for assigned in worker_assignments
    ]

    # This spawns |num_aggregation_workers| intermediate actors that aggregate
    # experiences in parallel. We force colocation on the same node (localhost)
    # to maximize data bandwidth between them and the driver.
    localhost = platform.node()
    assert localhost != "", ("ERROR: Cannot determine local node name! "
                             "`platform.node()` returned empty string.")
    all_co_located = create_colocated_actors(
        actor_specs=[
            # (class, args, kwargs={}, count=1)
            (Aggregator, [config, g], {}, 1) for g in rollout_groups
        ],
        node=localhost,
    )

    # Use the first ([0]) of each created group (each group only has one
    # actor: count=1).
    train_batches = from_actors([group[0] for group in all_co_located])

    # TODO(ekl) properly account for replay.
    def record_steps_sampled(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        if isinstance(batch, MultiAgentBatch):
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += \
                batch.agent_steps()
        else:
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    return train_batches.gather_async().for_each(record_steps_sampled)
def AsyncGradients(
        workers: WorkerSet) -> LocalIterator[Tuple[ModelGradients, int]]:
    """Operator to compute gradients in parallel from rollout workers.

    Args:
        workers (WorkerSet): set of rollout workers to use.

    Returns:
        A local iterator over policy gradients computed on rollout workers.

    Examples:
        >>> from ray.rllib.execution.rollout_ops import AsyncGradients
        >>> workers = ... # doctest: +SKIP
        >>> grads_op = AsyncGradients(workers) # doctest: +SKIP
        >>> print(next(grads_op)) # doctest: +SKIP
        {"var_0": ..., ...}, 50  # grads, batch count

    Updates the STEPS_SAMPLED_COUNTER counter and LEARNER_INFO field in the
    local iterator context.
    """
    # Ensure workers are initially in sync.
    workers.sync_weights()

    # This function will be applied remotely on the workers.
    def samples_to_grads(samples):
        return get_global_worker().compute_gradients(samples), samples.count

    # Record learner metrics and pass through (grads, count).
    class record_metrics:
        def _on_fetch_start(self):
            self.fetch_start_time = time.perf_counter()

        def __call__(self, item):
            (grads, info), count = item
            metrics = _get_shared_metrics()
            metrics.counters[STEPS_SAMPLED_COUNTER] += count
            metrics.info[LEARNER_INFO] = ({
                DEFAULT_POLICY_ID: info
            } if LEARNER_STATS_KEY in info else info)
            metrics.timers[GRAD_WAIT_TIMER].push(time.perf_counter() -
                                                 self.fetch_start_time)
            return grads, count

    rollouts = from_actors(workers.remote_workers())
    grads = rollouts.for_each(samples_to_grads)
    return grads.gather_async().for_each(record_metrics())
def AsyncGradients(
        workers: WorkerSet) -> LocalIterator[Tuple[GradientType, int]]:
    """Operator to compute gradients in parallel from rollout workers.

    Arguments:
        workers (WorkerSet): set of rollout workers to use.

    Returns:
        A local iterator over policy gradients computed on rollout workers.

    Examples:
        >>> grads_op = AsyncGradients(workers)
        >>> print(next(grads_op))
        {"var_0": ..., ...}, 50  # grads, batch count

    Updates the STEPS_SAMPLED_COUNTER counter and LEARNER_INFO field in the
    local iterator context.
    """

    # This function will be applied remotely on the workers.
    def samples_to_grads(samples):
        return get_global_worker().compute_gradients(samples), samples.count

    # Record learner metrics and pass through (grads, count).
    class record_metrics:
        def _on_fetch_start(self):
            self.fetch_start_time = time.perf_counter()

        def __call__(self, item):
            (grads, info), count = item
            metrics = LocalIterator.get_metrics()
            metrics.counters[STEPS_SAMPLED_COUNTER] += count
            metrics.info[LEARNER_INFO] = get_learner_stats(info)
            metrics.timers[GRAD_WAIT_TIMER].push(time.perf_counter() -
                                                 self.fetch_start_time)
            return grads, count

    rollouts = from_actors(workers.remote_workers())
    grads = rollouts.for_each(samples_to_grads)
    return grads.gather_async().for_each(record_metrics())
def ParallelRollouts(workers: WorkerSet, *, mode="bulk_sync",
                     num_async=1) -> LocalIterator[SampleBatch]:
    """Operator to collect experiences in parallel from rollout workers.

    If there are no remote workers, experiences will be collected serially
    from the local worker instance instead.

    Arguments:
        workers (WorkerSet): set of rollout workers to use.
        mode (str): One of {'async', 'bulk_sync', 'raw'}.
            - In 'async' mode, batches are returned as soon as they are
              computed by rollout workers with no order guarantees.
            - In 'bulk_sync' mode, we collect one batch from each worker
              and concatenate them together into a large batch to return.
            - In 'raw' mode, the ParallelIterator object is returned directly
              and the caller is responsible for implementing gather and
              updating the timesteps counter.
        num_async (int): In async mode, the max number of async requests in
            flight per actor.

    Returns:
        A local iterator over experiences collected in parallel.

    Examples:
        >>> rollouts = ParallelRollouts(workers, mode="async")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        50  # config.rollout_fragment_length

        >>> rollouts = ParallelRollouts(workers, mode="bulk_sync")
        >>> batch = next(rollouts)
        >>> print(batch.count)
        200  # config.rollout_fragment_length * config.num_workers

    Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context.
    """
    # Ensure workers are initially in sync.
    workers.sync_weights()

    def report_timesteps(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    if not workers.remote_workers():
        # Handle the serial sampling case.
        def sampler(_):
            while True:
                yield workers.local_worker().sample()

        return (LocalIterator(sampler,
                              SharedMetrics()).for_each(report_timesteps))

    # Create a parallel iterator over generated experiences.
    rollouts = from_actors(workers.remote_workers())

    if mode == "bulk_sync":
        return rollouts \
            .batch_across_shards() \
            .for_each(lambda batches: SampleBatch.concat_samples(batches)) \
            .for_each(report_timesteps)
    elif mode == "async":
        return rollouts.gather_async(
            num_async=num_async).for_each(report_timesteps)
    elif mode == "raw":
        return rollouts
    else:
        raise ValueError("mode must be one of 'bulk_sync', 'async', 'raw', "
                         "got '{}'".format(mode))
def apex_execution_plan(workers: WorkerSet,
                        config: dict) -> LocalIterator[dict]:
    # Create a number of replay buffer actors.
    num_replay_buffer_shards = config["optimizer"]["num_replay_buffer_shards"]
    replay_actors = create_colocated(ReplayActor, [
        num_replay_buffer_shards,
        config["learning_starts"],
        config["buffer_size"],
        config["train_batch_size"],
        config["prioritized_replay_alpha"],
        config["prioritized_replay_beta"],
        config["prioritized_replay_eps"],
        config["multiagent"]["replay_mode"],
        config.get("replay_sequence_length", 1),
    ], num_replay_buffer_shards)

    # Start the learner thread.
    learner_thread = LearnerThread(workers.local_worker())
    learner_thread.start()

    # Update experience priorities post learning.
    def update_prio_and_stats(item: Tuple["ActorHandle", dict, int]) -> None:
        actor, prio_dict, count = item
        actor.update_priorities.remote(prio_dict)
        metrics = _get_shared_metrics()
        # Manually update the steps trained counter since the learner thread
        # is executing outside the pipeline.
        metrics.counters[STEPS_TRAINED_COUNTER] += count
        metrics.timers["learner_dequeue"] = learner_thread.queue_timer
        metrics.timers["learner_grad"] = learner_thread.grad_timer
        metrics.timers["learner_overall"] = learner_thread.overall_timer

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our replay buffer actors. Update
    # the weights of the worker that generated the batch.
    rollouts = ParallelRollouts(workers, mode="async", num_async=2)
    store_op = rollouts \
        .for_each(StoreToReplayBuffer(actors=replay_actors))
    # Only need to update workers if there are remote workers.
    if workers.remote_workers():
        store_op = store_op.zip_with_source_actor() \
            .for_each(UpdateWorkerWeights(
                learner_thread, workers,
                max_weight_sync_delay=(
                    config["optimizer"]["max_weight_sync_delay"])
            ))

    # (2) Read experiences from the replay buffer actors and send to the
    # learner thread via its in-queue.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(actors=replay_actors, num_async=4) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .zip_with_source_actor() \
        .for_each(Enqueue(learner_thread.inqueue))

    # (3) Get priorities back from learner thread and apply them to the
    # replay buffer actors.
    update_op = Dequeue(
        learner_thread.outqueue, check=learner_thread.is_alive) \
        .for_each(update_prio_and_stats) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"],
            by_steps_trained=True))

    if config["training_intensity"]:
        # Execute (1), (2) with a fixed intensity ratio.
        rr_weights = calculate_rr_weights(config) + ["*"]
        merged_op = Concurrently(
            [store_op, replay_op, update_op],
            mode="round_robin",
            output_indexes=[2],
            round_robin_weights=rr_weights)
    else:
        # Execute (1), (2), (3) asynchronously as fast as possible. Only
        # output items from (3) since metrics aren't available before then.
        merged_op = Concurrently(
            [store_op, replay_op, update_op], mode="async", output_indexes=[2])

    # Add in extra replay and learner metrics to the training result.
    def add_apex_metrics(result: dict) -> dict:
        replay_stats = ray.get(replay_actors[0].stats.remote(
            config["optimizer"].get("debug")))
        exploration_infos = workers.foreach_trainable_policy(
            lambda p, _: p.get_exploration_info())
        result["info"].update({
            "exploration_infos": exploration_infos,
            "learner_queue": learner_thread.learner_queue_size.stats(),
            "learner": copy.deepcopy(learner_thread.stats),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)
def execution_plan(workers: WorkerSet, config: AlgorithmConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert (
        len(kwargs) == 0
    ), "MBMPO execution_plan does NOT take any additional parameters"

    # Train TD Models on the driver.
    workers.local_worker().foreach_policy(fit_dynamics)

    # Sync driver's policy with workers.
    workers.sync_weights()

    # Sync TD Models and normalization stats with workers
    sync_ensemble(workers)
    sync_stats(workers)

    # Dropping metrics from the first iteration
    _, _ = collect_episodes(
        workers.local_worker(), workers.remote_workers(), [],
        timeout_seconds=9999)

    # Metrics Collector.
    metric_collect = CollectMetrics(
        workers,
        min_history=0,
        timeout_seconds=config["metrics_episode_collection_timeout_s"],
    )

    num_inner_steps = config["inner_adaptation_steps"]

    def inner_adaptation_steps(itr):
        buf = []
        split = []
        metrics = {}
        for samples in itr:
            print("Collecting Samples, Inner Adaptation {}".format(
                len(split)))
            # Processing Samples (Standardize Advantages)
            samples, split_lst = post_process_samples(samples, config)

            buf.extend(samples)
            split.append(split_lst)

            adapt_iter = len(split) - 1
            prefix = "DynaTrajInner_" + str(adapt_iter)
            metrics = post_process_metrics(prefix, workers, metrics)

            if len(split) > num_inner_steps:
                out = SampleBatch.concat_samples(buf)
                out["split"] = np.array(split)
                buf = []
                split = []

                yield out, metrics
                metrics = {}
            else:
                inner_adaptation(workers, samples)

    # Iterator for Inner Adaptation Data gathering (from pre->post
    # adaptation).
    rollouts = from_actors(workers.remote_workers())
    rollouts = rollouts.batch_across_shards()
    rollouts = rollouts.transform(inner_adaptation_steps)

    # Meta update step with outer combine loop for multiple MAML
    # iterations.
    train_op = rollouts.combine(
        MetaUpdate(
            workers,
            config["num_maml_steps"],
            config["maml_optimizer_steps"],
            metric_collect,
        ))

    return train_op
def execution_plan(workers: WorkerSet, config: dict,
                   **kwargs) -> LocalIterator[dict]:
    assert (
        len(kwargs) == 0
    ), "Apex execution_plan does NOT take any additional parameters"

    # Create a number of replay buffer actors.
    num_replay_buffer_shards = config["optimizer"]["num_replay_buffer_shards"]
    buffer_size = (config["replay_buffer_config"]["capacity"] //
                   num_replay_buffer_shards)
    replay_actor_args = [
        num_replay_buffer_shards,
        config["learning_starts"],
        buffer_size,
        config["train_batch_size"],
        config["replay_buffer_config"]["prioritized_replay_alpha"],
        config["replay_buffer_config"]["prioritized_replay_beta"],
        config["replay_buffer_config"]["prioritized_replay_eps"],
        config["multiagent"]["replay_mode"],
        config["replay_buffer_config"].get("replay_sequence_length", 1),
    ]
    # Place all replay buffer shards on the same node as the learner
    # (driver process that runs this execution plan).
    if config["replay_buffer_shards_colocated_with_driver"]:
        replay_actors = create_colocated_actors(
            actor_specs=[
                # (class, args, kwargs={}, count)
                (ReplayActor, replay_actor_args, {},
                 num_replay_buffer_shards)
            ],
            node=platform.node(),  # localhost
        )[0]  # [0]=only one item in `actor_specs`.
    # Place replay buffer shards on any node(s).
    else:
        replay_actors = [
            ReplayActor(*replay_actor_args)
            for _ in range(num_replay_buffer_shards)
        ]

    # Start the learner thread.
    learner_thread = LearnerThread(workers.local_worker())
    learner_thread.start()

    # Update experience priorities post learning.
    def update_prio_and_stats(
            item: Tuple[ActorHandle, dict, int, int]) -> None:
        actor, prio_dict, env_count, agent_count = item
        if config.get("prioritized_replay"):
            actor.update_priorities.remote(prio_dict)
        metrics = _get_shared_metrics()
        # Manually update the steps trained counter since the learner
        # thread is executing outside the pipeline.
        metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = env_count
        metrics.counters[STEPS_TRAINED_COUNTER] += env_count
        metrics.timers["learner_dequeue"] = learner_thread.queue_timer
        metrics.timers["learner_grad"] = learner_thread.grad_timer
        metrics.timers["learner_overall"] = learner_thread.overall_timer

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in one of our replay buffer
    # actors. Update the weights of the worker that generated the batch.
    rollouts = ParallelRollouts(workers, mode="async", num_async=2)
    store_op = rollouts.for_each(StoreToReplayBuffer(actors=replay_actors))
    # Only need to update workers if there are remote workers.
    if workers.remote_workers():
        store_op = store_op.zip_with_source_actor().for_each(
            UpdateWorkerWeights(
                learner_thread,
                workers,
                max_weight_sync_delay=(
                    config["optimizer"]["max_weight_sync_delay"]),
            ))

    # (2) Read experiences from one of the replay buffer actors and send
    # to the learner thread via its in-queue.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = (Replay(actors=replay_actors, num_async=4)
                 .for_each(lambda x: post_fn(x, workers, config))
                 .zip_with_source_actor()
                 .for_each(Enqueue(learner_thread.inqueue)))

    # (3) Get priorities back from learner thread and apply them to the
    # replay buffer actors.
    update_op = (Dequeue(learner_thread.outqueue,
                         check=learner_thread.is_alive)
                 .for_each(update_prio_and_stats)
                 .for_each(UpdateTargetNetwork(
                     workers,
                     config["target_network_update_freq"],
                     by_steps_trained=True)))

    if config["training_intensity"]:
        # Execute (1), (2) with a fixed intensity ratio.
        rr_weights = calculate_rr_weights(config) + ["*"]
        merged_op = Concurrently(
            [store_op, replay_op, update_op],
            mode="round_robin",
            output_indexes=[2],
            round_robin_weights=rr_weights,
        )
    else:
        # Execute (1), (2), (3) asynchronously as fast as possible. Only
        # output items from (3) since metrics aren't available before
        # then.
        merged_op = Concurrently([store_op, replay_op, update_op],
                                 mode="async",
                                 output_indexes=[2])

    # Add in extra replay and learner metrics to the training result.
    def add_apex_metrics(result: dict) -> dict:
        replay_stats = ray.get(replay_actors[0].stats.remote(
            config["optimizer"].get("debug")))
        exploration_infos = workers.foreach_policy_to_train(
            lambda p, _: p.get_exploration_state())
        result["info"].update({
            "exploration_infos": exploration_infos,
            "learner_queue": learner_thread.learner_queue_size.stats(),
            LEARNER_INFO: copy.deepcopy(learner_thread.learner_info),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of
    # epsilons.
    selected_workers = workers.remote_workers(
    )[-len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)
def ParallelRollouts(workers: WorkerSet, *, mode="bulk_sync",
                     num_async=1) -> LocalIterator[SampleBatch]:
    """Operator to collect experiences in parallel from rollout workers.

    If there are no remote workers, experiences will be collected serially
    from the local worker instance instead.

    Args:
        workers (WorkerSet): set of rollout workers to use.
        mode (str): One of 'async', 'bulk_sync', 'raw'. In 'async' mode,
            batches are returned as soon as they are computed by rollout
            workers with no order guarantees. In 'bulk_sync' mode, we collect
            one batch from each worker and concatenate them together into a
            large batch to return. In 'raw' mode, the ParallelIterator object
            is returned directly and the caller is responsible for
            implementing gather and updating the timesteps counter.
        num_async (int): In async mode, the max number of async requests in
            flight per actor.

    Returns:
        A local iterator over experiences collected in parallel.

    Examples:
        >>> from ray.rllib.execution import ParallelRollouts
        >>> workers = ... # doctest: +SKIP
        >>> rollouts = ParallelRollouts(workers, mode="async") # doctest: +SKIP
        >>> batch = next(rollouts) # doctest: +SKIP
        >>> print(batch.count) # doctest: +SKIP
        50  # config.rollout_fragment_length

        >>> rollouts = ParallelRollouts(workers, mode="bulk_sync") # doctest: +SKIP
        >>> batch = next(rollouts) # doctest: +SKIP
        >>> print(batch.count) # doctest: +SKIP
        200  # config.rollout_fragment_length * config.num_workers

    Updates the STEPS_SAMPLED_COUNTER counter in the local iterator context.
    """
    # Ensure workers are initially in sync.
    workers.sync_weights()

    def report_timesteps(batch):
        metrics = _get_shared_metrics()
        metrics.counters[STEPS_SAMPLED_COUNTER] += batch.count
        if isinstance(batch, MultiAgentBatch):
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += \
                batch.agent_steps()
        else:
            metrics.counters[AGENT_STEPS_SAMPLED_COUNTER] += batch.count
        return batch

    if not workers.remote_workers():
        # Handle the `num_workers=0` case, in which the local worker
        # has to do sampling as well.
        return LocalIterator(
            lambda timeout: workers.local_worker().item_generator,
            SharedMetrics()).for_each(report_timesteps)

    # Create a parallel iterator over generated experiences.
    rollouts = from_actors(workers.remote_workers())

    if mode == "bulk_sync":
        return (rollouts.batch_across_shards().for_each(
            lambda batches: SampleBatch.concat_samples(batches)).for_each(
                report_timesteps))
    elif mode == "async":
        return rollouts.gather_async(
            num_async=num_async).for_each(report_timesteps)
    elif mode == "raw":
        return rollouts
    else:
        raise ValueError(
            "mode must be one of 'bulk_sync', 'async', 'raw', got '{}'".format(
                mode))
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the DD-PPO algorithm. Defines the distributed
    dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training results.
    """
    assert len(kwargs) == 0, (
        "DDPPO execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="raw")

    # Setup the distributed processes.
    if not workers.remote_workers():
        raise ValueError("This optimizer requires >0 remote workers.")
    ip = ray.get(workers.remote_workers()[0].get_node_ip.remote())
    port = ray.get(workers.remote_workers()[0].find_free_port.remote())
    address = "tcp://{ip}:{port}".format(ip=ip, port=port)
    logger.info(
        "Creating torch process group with leader {}".format(address))

    # Get setup tasks in order to throw errors on failure.
    ray.get([
        worker.setup_torch_data_parallel.remote(
            url=address,
            world_rank=i,
            world_size=len(workers.remote_workers()),
            backend=config["torch_distributed_backend"])
        for i, worker in enumerate(workers.remote_workers())
    ])
    logger.info("Torch process group init completed")

    # This function is applied remotely on each rollout worker.
    def train_torch_distributed_allreduce(batch):
        expected_batch_size = (config["rollout_fragment_length"] *
                               config["num_envs_per_worker"])
        this_worker = get_global_worker()
        assert batch.count == expected_batch_size, \
            ("Batch size possibly out of sync between workers, expected:",
             expected_batch_size, "got:", batch.count)
        logger.info("Executing distributed minibatch SGD "
                    "with epoch size {}, minibatch size {}".format(
                        batch.count, config["sgd_minibatch_size"]))
        info = do_minibatch_sgd(batch, this_worker.policy_map, this_worker,
                                config["num_sgd_iter"],
                                config["sgd_minibatch_size"], ["advantages"])
        return info, batch.count

    # Broadcast the local set of global vars.
    def update_worker_global_vars(item):
        global_vars = _get_global_vars()
        for w in workers.remote_workers():
            w.set_global_vars.remote(global_vars)
        return item

    # Have to manually record stats since we are using "raw" rollouts mode.
    class RecordStats:
        def _on_fetch_start(self):
            self.fetch_start_time = time.perf_counter()

        def __call__(self, items):
            for item in items:
                info, count = item
                metrics = _get_shared_metrics()
                metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = count
                metrics.counters[STEPS_SAMPLED_COUNTER] += count
                metrics.counters[STEPS_TRAINED_COUNTER] += count
                metrics.info[LEARNER_INFO] = info
                # Since SGD happens remotely, the time delay between fetch
                # and completion is approximately the SGD step time.
                metrics.timers[LEARN_ON_BATCH_TIMER].push(
                    time.perf_counter() - self.fetch_start_time)

    train_op = (
        rollouts.for_each(train_torch_distributed_allreduce)  # allreduce
        .batch_across_shards()  # List[(grad_info, count)]
        .for_each(RecordStats()))

    train_op = train_op.for_each(update_worker_global_vars)

    # Sync down the weights. As with the sync up, this is not really
    # needed unless the user is reading the local weights.
    if config["keep_local_weights_in_sync"]:

        def download_weights(item):
            workers.local_worker().set_weights(
                ray.get(workers.remote_workers()[0].get_weights.remote()))
            return item

        train_op = train_op.for_each(download_weights)

    # In debug mode, check the allreduce successfully synced the weights.
    if logger.isEnabledFor(logging.DEBUG):

        def check_sync(item):
            weights = ray.get(
                [w.get_weights.remote() for w in workers.remote_workers()])
            sums = []
            for w in weights:
                acc = 0
                for p in w.values():
                    for k, v in p.items():
                        acc += v.sum()
                sums.append(float(acc))
            logger.debug("The worker weight sums are {}".format(sums))
            assert len(set(sums)) == 1, sums

        train_op = train_op.for_each(check_sync)

    return StandardMetricsReporting(train_op, workers, config)
def synchronous_parallel_sample(
    *,
    worker_set: WorkerSet,
    max_agent_steps: Optional[int] = None,
    max_env_steps: Optional[int] = None,
    concat: bool = True,
) -> Union[List[SampleBatchType], SampleBatchType]:
    """Runs parallel and synchronous rollouts on all remote workers.

    Waits for all workers to return from the remote calls.

    If no remote workers exist (num_workers == 0), use the local worker
    for sampling.

    Args:
        worker_set: The WorkerSet to use for sampling.
        max_agent_steps: Optional number of agent steps to be included in the
            final batch.
        max_env_steps: Optional number of environment steps to be included in
            the final batch.
        concat: Whether to concat all resulting batches at the end and return
            the concat'd batch.

    Returns:
        The list of collected sample batch types (one for each parallel
        rollout worker in the given `worker_set`), or a single concatenated
        batch if `concat` is True.

    Examples:
        >>> # Define an RLlib trainer.
        >>> trainer = ... # doctest: +SKIP
        >>> # 2 remote workers (num_workers=2):
        >>> batches = synchronous_parallel_sample(
        ...     worker_set=trainer.workers, concat=False) # doctest: +SKIP
        >>> print(len(batches)) # doctest: +SKIP
        2
        >>> print(batches[0]) # doctest: +SKIP
        SampleBatch(16: ['obs', 'actions', 'rewards', 'dones'])
        >>> # 0 remote workers (num_workers=0): Using the local worker.
        >>> batches = synchronous_parallel_sample(
        ...     worker_set=trainer.workers, concat=False) # doctest: +SKIP
        >>> print(len(batches)) # doctest: +SKIP
        1
    """
    # Only allow one of `max_agent_steps` or `max_env_steps` to be defined.
    assert not (max_agent_steps is not None and max_env_steps is not None)

    agent_or_env_steps = 0
    max_agent_or_env_steps = max_agent_steps or max_env_steps or None
    all_sample_batches = []

    # Stop collecting batches as soon as one criterion is met.
    while (max_agent_or_env_steps is None and agent_or_env_steps == 0) or \
            (max_agent_or_env_steps is not None
             and agent_or_env_steps < max_agent_or_env_steps):
        # No remote workers in the set -> Use local worker for collecting
        # samples.
        if not worker_set.remote_workers():
            sample_batches = [worker_set.local_worker().sample()]
        # Loop over remote workers' `sample()` method in parallel.
        else:
            sample_batches = ray.get([
                worker.sample.remote()
                for worker in worker_set.remote_workers()
            ])
        # Update our counters for the stopping criterion of the while loop.
        for b in sample_batches:
            if max_agent_steps:
                agent_or_env_steps += b.agent_steps()
            else:
                agent_or_env_steps += b.env_steps()
        all_sample_batches.extend(sample_batches)

    if concat is True:
        full_batch = SampleBatch.concat_samples(all_sample_batches)
        # Discard collected incomplete episodes in episode mode.
        # if max_episodes is not None and episodes >= max_episodes:
        #     last_complete_ep_idx = len(full_batch) - full_batch[
        #         SampleBatch.DONES
        #     ].reverse().index(1)
        #     full_batch = full_batch.slice(0, last_complete_ep_idx)
        return full_batch
    else:
        return all_sample_batches
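# Usage sketch (assumption, not from the original source): keep sampling until
# at least 1000 environment steps were collected, then return one concatenated
# batch. `trainer` is assumed to be an RLlib Trainer as in the docstring above.
train_batch = synchronous_parallel_sample(
    worker_set=trainer.workers, max_env_steps=1000, concat=True)
print(train_batch.env_steps())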
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert len(kwargs) == 0, (
        "MAML execution_plan does NOT take any additional parameters")

    # Sync workers with meta policy
    workers.sync_weights()

    # Samples and sets worker tasks
    use_meta_env = config["use_meta_env"]
    set_worker_tasks(workers, use_meta_env)

    # Metric Collector
    metric_collect = CollectMetrics(
        workers,
        min_history=config["metrics_num_episodes_for_smoothing"],
        timeout_seconds=config["metrics_episode_collection_timeout_s"])

    # Iterator for Inner Adaptation Data gathering (from pre->post
    # adaptation)
    inner_steps = config["inner_adaptation_steps"]

    def inner_adaptation_steps(itr):
        buf = []
        split = []
        metrics = {}
        for samples in itr:
            # Processing Samples (Standardize Advantages)
            split_lst = []
            for sample in samples:
                sample["advantages"] = standardized(sample["advantages"])
                split_lst.append(sample.count)

            buf.extend(samples)
            split.append(split_lst)

            adapt_iter = len(split) - 1
            metrics = post_process_metrics(adapt_iter, workers, metrics)

            if len(split) > inner_steps:
                out = SampleBatch.concat_samples(buf)
                out["split"] = np.array(split)
                buf = []
                split = []

                # Reporting Adaptation Rew Diff
                ep_rew_pre = metrics["episode_reward_mean"]
                ep_rew_post = metrics["episode_reward_mean_adapt_" +
                                      str(inner_steps)]
                metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre
                yield out, metrics
                metrics = {}
            else:
                inner_adaptation(workers, samples)

    rollouts = from_actors(workers.remote_workers())
    rollouts = rollouts.batch_across_shards()
    rollouts = rollouts.transform(inner_adaptation_steps)

    # Metaupdate Step
    train_op = rollouts.for_each(
        MetaUpdate(workers, config["maml_optimizer_steps"], metric_collect,
                   use_meta_env))

    return train_op
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the MB-MPO algorithm. Defines the distributed
    dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training results.
    """
    # Train TD Models on the driver.
    workers.local_worker().foreach_policy(fit_dynamics)

    # Sync driver's policy with workers.
    workers.sync_weights()

    # Sync TD Models and normalization stats with workers
    sync_ensemble(workers)
    sync_stats(workers)

    # Dropping metrics from the first iteration
    _, _ = collect_episodes(
        workers.local_worker(), workers.remote_workers(), [],
        timeout_seconds=9999)

    # Metrics Collector.
    metric_collect = CollectMetrics(
        workers,
        min_history=0,
        timeout_seconds=config["collect_metrics_timeout"])

    num_inner_steps = config["inner_adaptation_steps"]

    def inner_adaptation_steps(itr):
        buf = []
        split = []
        metrics = {}
        for samples in itr:
            print("Collecting Samples, Inner Adaptation {}".format(
                len(split)))
            # Processing Samples (Standardize Advantages)
            samples, split_lst = post_process_samples(samples, config)

            buf.extend(samples)
            split.append(split_lst)

            adapt_iter = len(split) - 1
            prefix = "DynaTrajInner_" + str(adapt_iter)
            metrics = post_process_metrics(prefix, workers, metrics)

            if len(split) > num_inner_steps:
                out = SampleBatch.concat_samples(buf)
                out["split"] = np.array(split)
                buf = []
                split = []

                yield out, metrics
                metrics = {}
            else:
                inner_adaptation(workers, samples)

    # Iterator for Inner Adaptation Data gathering (from pre->post
    # adaptation).
    rollouts = from_actors(workers.remote_workers())
    rollouts = rollouts.batch_across_shards()
    rollouts = rollouts.transform(inner_adaptation_steps)

    # Meta update step with outer combine loop for multiple MAML iterations.
    train_op = rollouts.combine(
        MetaUpdate(workers, config["num_maml_steps"],
                   config["maml_optimizer_steps"], metric_collect))

    return train_op