def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect large batches of relevant experiences & standardize.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    rollouts = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"]))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the PPO algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))
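# A minimal sketch of how an execution_plan like the one above is typically
# wired into a Trainer. This assumes an RLlib ~1.x API where
# ray.rllib.agents.trainer_template.build_trainer() still exists; the trainer
# name "MyPPO" is illustrative and not part of the original code.
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.trainer_template import build_trainer

MyPPOTrainer = build_trainer(
    name="MyPPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    execution_plan=execution_plan,  # The dataflow defined above.
)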
def test_standardize(self):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="async")
    b = a.for_each(StandardizeFields([SampleBatch.EPS_ID]))
    batch = next(b)
    assert abs(np.mean(batch[SampleBatch.EPS_ID])) < 0.001, batch
    assert abs(np.std(batch[SampleBatch.EPS_ID]) - 1.0) < 0.001, batch
def test_store_to_replay_actor(self):
    ReplayActor = ray.remote(num_cpus=0)(MultiAgentReplayBuffer)
    actor = ReplayActor.remote(
        num_shards=1,
        learning_starts=200,
        capacity=1000,
        replay_batch_size=100,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=0.0001,
    )
    assert len(ray.get(actor.sample.remote(100))) == 0

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(actors=[actor]))

    next(b)
    assert len(ray.get(
        actor.sample.remote(100))) == 0  # learning hasn't started
    next(b)
    assert ray.get(actor.sample.remote(100)).count == 100

    replay_op = Replay(actors=[actor])
    assert next(replay_op).count == 100
def execution_plan(workers, config, **kwargs): assert "local_replay_buffer" in kwargs, ( "GenericOffPolicy execution plan requires a local replay buffer.") local_replay_buffer = kwargs["local_replay_buffer"] rollouts = ParallelRollouts(workers, mode="bulk_sync") # (1) Generate rollouts and store them in our local replay buffer. store_op = rollouts.for_each( StoreToReplayBuffer(local_buffer=local_replay_buffer)) if config["simple_optimizer"]: train_step_op = TrainOneStep(workers) else: train_step_op = MultiGPUTrainOneStep( workers=workers, sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], _fake_gpus=config["_fake_gpus"]) # (2) Read and train on experiences from the replay buffer. replay_op = Replay(local_buffer=local_replay_buffer) \ .for_each(train_step_op) \ .for_each(UpdateTargetNetwork( workers, config["target_network_update_freq"])) # Alternate deterministically between (1) and (2). train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config, **kwargs):
    assert (
        len(kwargs) == 0
    ), "Dreamer execution_plan does NOT take any additional parameters"

    # Special replay buffer for Dreamer agent.
    episode_buffer = EpisodicBuffer(length=config["batch_length"])

    local_worker = workers.local_worker()

    # Prefill episode buffer with initial exploration (uniform sampling)
    while total_sampled_timesteps(local_worker) < config["prefill_timesteps"]:
        samples = local_worker.sample()
        episode_buffer.add(samples)

    batch_size = config["batch_size"]
    dreamer_train_iters = config["dreamer_train_iters"]
    act_repeat = config["action_repeat"]

    rollouts = ParallelRollouts(workers)
    rollouts = rollouts.for_each(
        DreamerIteration(
            local_worker,
            episode_buffer,
            dreamer_train_iters,
            batch_size,
            act_repeat,
        )
    )
    return rollouts
def execution_plan(
    workers: WorkerSet, config: TrainerConfigDict, **kwargs
) -> LocalIterator[dict]:
    assert (
        len(kwargs) == 0
    ), "MARWIL execution_plan does NOT take any additional parameters"

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = MultiAgentReplayBuffer(
        learning_starts=config["learning_starts"],
        capacity=config["replay_buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_sequence_length=1,
    )

    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    replay_op = (
        Replay(local_buffer=replay_buffer)
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )
        )
        .for_each(TrainOneStep(workers))
    )

    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1]
    )

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert ("local_replay_buffer" in kwargs), \
        "SlateQ execution plan requires a local replay buffer."

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer.
    # Calling next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=kwargs["local_replay_buffer"]))

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step.
    replay_op = (Replay(
        local_buffer=kwargs["local_replay_buffer"]).for_each(
            TrainOneStep(workers)).for_each(
                UpdateTargetNetwork(workers,
                                    config["target_network_update_freq"])))

    # Alternate deterministically between (1) and (2). Only return the
    # output of (2) since training metrics are not available until (2)
    # runs.
    train_op = Concurrently(
        [store_op, replay_op],
        mode="round_robin",
        output_indexes=[1],
        round_robin_weights=calculate_round_robin_weights(config),
    )

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan_nfsp(workers, config):
    # 1. define buffers
    replay_size = config["replay_buffer_size"]
    reservoir_size = config["reservoir_buffer_size"]
    replay_buffers = MultiAgentSimpleReplayBuffer(
        replay_size, config["multiagent"]["policies"])
    reservoir_buffers = MultiAgentReservoirBuffer(
        reservoir_size, config["multiagent"]["policies"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # 2. define store operations
    store_op = rollouts.for_each(
        StoreToBuffers(replay_buffers, reservoir_buffers,
                       config['multiagent']['policies_to_train']))  # Sampling

    # 3. define replay/reservoir operations
    replay_op = SimpleLocalReplayMultiagent(
        replay_buffers,
        config["replay_train_batch_size"],
        config["replay_min_size_to_learn"],
        config["replay_train_every"]) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config['dqn_policy']["target_network_update_freq"]))

    reservoir_op = LocalReservoirMultiagent(
        reservoir_buffers,
        config["reservoir_train_batch_size"],
        config["reservoir_min_size_to_learn"],
        config["reservoir_train_every"]) \
        .for_each(TrainOneStep(workers))

    # 4. define main train loop
    train_op = Concurrently([replay_op, reservoir_op, store_op],
                            mode="round_robin")

    return LowMemoryMetricsReporting(train_op, workers, config)
def execution_plan(
    workers: WorkerSet, config: TrainerConfigDict, **kwargs
) -> LocalIterator[dict]:
    assert (
        len(kwargs) == 0
    ), "QMIX execution_plan does NOT take any additional parameters"

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["buffer_size"])

    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    train_op = (
        Replay(local_buffer=replay_buffer)
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )
        )
        .for_each(TrainOneStep(workers))
        .for_each(
            UpdateTargetNetwork(workers, config["target_network_update_freq"])
        )
    )

    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1]
    )

    return StandardMetricsReporting(merged_op, workers, config)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert (
        len(kwargs) == 0
    ), "Alpha zero execution_plan does NOT take any additional parameters"

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["simple_optimizer"]:
        train_op = rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )).for_each(
                TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))
    else:
        replay_buffer = SimpleReplayBuffer(config["buffer_size"])

        store_op = rollouts.for_each(
            StoreToReplayBuffer(local_buffer=replay_buffer))

        replay_op = (Replay(local_buffer=replay_buffer).filter(
            WaitUntilTimestepsElapsed(config["learning_starts"])).combine(
                ConcatBatches(
                    min_batch_size=config["train_batch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"],
                )).for_each(
                    TrainOneStep(
                        workers, num_sgd_iter=config["num_sgd_iter"])))

        train_op = Concurrently([store_op, replay_op],
                                mode="round_robin",
                                output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def gather_experiences_directly(workers, config):
    rollouts = ParallelRollouts(
        workers,
        mode="async",
        num_async=config["max_requests_in_flight_per_sampler_worker"],
    )

    # Augment with replay and concat to desired train batch size.
    train_batches = (
        rollouts.for_each(lambda batch: batch.decompress_if_needed())
        .for_each(
            MixInReplay(
                num_slots=config["replay_buffer_num_slots"],
                replay_proportion=config["replay_proportion"],
            )
        )
        .flatten()
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )
        )
    )

    return train_batches
def test_standardize(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="async")
    b = a.for_each(StandardizeFields(["t"]))
    batch = next(b)
    assert abs(np.mean(batch["t"])) < 0.001, batch
    assert abs(np.std(batch["t"]) - 1.0) < 0.001, batch
def test_avg_gradients(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(ComputeGradients(workers)).batch(4)
    c = b.for_each(AverageGradients())
    grads, counts = next(c)
    assert counts == 400, counts
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        multiagent_sync_replay=config.get("multiagent_sync_replay"),
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id]
                                        .data.get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target
    # network.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def test_compute_gradients(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(ComputeGradients(workers))
    grads, counts = next(b)
    assert counts == 100, counts
    timers = a.shared_metrics.get().timers
    assert "compute_grads" in timers
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect large batches of relevant experiences & standardize.
    rollouts = rollouts.for_each(
        SelectExperiences(workers.trainable_policies()))
    rollouts = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(
                workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                rollout_fragment_length=config["rollout_fragment_length"],
                num_envs_per_worker=config["num_envs_per_worker"],
                train_batch_size=config["train_batch_size"],
                shuffle_sequences=config["shuffle_sequences"],
                _fake_gpus=config["_fake_gpus"]))

    # Callback to update the KL based on optimization info.
    def update_kl(item):
        _, fetches = item

        def update(pi, pi_id):
            if pi_id in fetches:
                pi.update_kl(fetches[pi_id]["kl"])
            else:
                logger.warning("No data for {}, not updating kl".format(pi_id))

        workers.local_worker().foreach_trainable_policy(update)

    # Update KL after each round of training.
    train_op = train_op.for_each(update_kl)

    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: _warn_about_bad_reward_scales(config, result))
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    assert (len(kwargs) == 0
            ), "PPO execution_plan does NOT take any additional parameters"

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(local_worker=workers.local_worker()))
    # Concatenate the SampleBatches into one.
    rollouts = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    # Standardize advantages.
    rollouts = rollouts.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_op = rollouts.for_each(
            TrainOneStep(
                workers,
                num_sgd_iter=config["num_sgd_iter"],
                sgd_minibatch_size=config["sgd_minibatch_size"],
            ))
    else:
        train_op = rollouts.for_each(
            MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["sgd_minibatch_size"],
                num_sgd_iter=config["num_sgd_iter"],
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"],
            ))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(
        UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config).for_each(
        lambda result: warn_about_bad_reward_scales(config, result))
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    rollouts = ParallelRollouts(workers, mode="async")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(local_worker=workers.local_worker()))

    # Return training metrics.
    return StandardMetricsReporting(rollouts, workers, config)
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        **prio_args)

    global replay_buffer
    replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    store_op = rollouts.for_each(
        NoOpReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (
                    samples.policy_batches[policy_id].get("batch_indexes"),
                    td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1],
                            round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    assert "learner_stats" in next(b)
    counters = a.shared_metrics.get().counters
    assert counters["num_steps_sampled"] == 100, counters
    assert counters["num_steps_trained"] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    batch, stats = next(b)
    assert isinstance(batch, SampleBatch)
    assert "default_policy" in stats
    assert "learner_stats" in stats["default_policy"]
    counters = a.shared_metrics.get().counters
    assert counters["num_steps_sampled"] == 100, counters
    assert counters["num_steps_trained"] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
def off_policy_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """RLlib's default execution plan with an added warmup phase."""
    # Collects experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    # On the first iteration, combine experience batches until we hit
    # `learning_starts` in size.
    rollouts = rollouts.combine(
        LearningStarts(learning_starts=config["learning_starts"]))
    # Then, train the policy on those experiences and update the workers.
    train_op = rollouts.for_each(TrainOneStep(workers))
    # Add on the standard episode reward, etc. metrics reporting. This returns
    # a LocalIterator[metrics_dict] representing metrics for each train step.
    return StandardMetricsReporting(train_op, workers, config)
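# A plausible sketch (not the actual implementation) of the LearningStarts
# combiner referenced above, assuming it follows the same .combine() contract
# as ConcatBatches: a callable that receives one batch at a time and returns
# a list of zero or more output batches.
from ray.rllib.policy.sample_batch import SampleBatch


class LearningStarts:
    def __init__(self, learning_starts: int):
        self.learning_starts = learning_starts
        self.buffer = []
        self.count = 0
        self.done = False

    def __call__(self, batch: SampleBatch):
        if self.done:
            # Warmup already satisfied: pass batches through unchanged.
            return [batch]
        self.buffer.append(batch)
        self.count += batch.count
        if self.count < self.learning_starts:
            # Not enough warmup samples yet: emit nothing.
            return []
        # Emit one combined warmup batch, then behave as a pass-through.
        self.done = True
        out = SampleBatch.concat_samples(self.buffer)
        self.buffer = []
        return [out]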
def test_train_one_step(ray_start_regular_shared):
    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(TrainOneStep(workers))
    batch, stats = next(b)
    assert isinstance(batch, SampleBatch)
    assert DEFAULT_POLICY_ID in stats
    assert "learner_stats" in stats[DEFAULT_POLICY_ID]
    counters = a.shared_metrics.get().counters
    assert counters[STEPS_SAMPLED_COUNTER] == 100, counters
    assert counters[STEPS_TRAINED_COUNTER] == 100, counters
    timers = a.shared_metrics.get().timers
    assert "learn" in timers
    workers.stop()
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the SlateQ algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"],
    )

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(TrainOneStep(workers))

    if config["slateq_strategy"] != "RANDOM":
        # Alternate deterministically between (1) and (2). Only return the
        # output of (2) since training metrics are not available until (2)
        # runs.
        train_op = Concurrently(
            [store_op, replay_op],
            mode="round_robin",
            output_indexes=[1],
            round_robin_weights=calculate_round_robin_weights(config))
    else:
        # No training is needed for the RANDOM strategy.
        train_op = rollouts

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    # Special Replay Buffer for Dreamer agent
    episode_buffer = EpisodicBuffer(length=config["batch_length"])

    local_worker = workers.local_worker()

    # Prefill episode buffer with initial exploration (uniform sampling)
    while total_sampled_timesteps(local_worker) < config["prefill_timesteps"]:
        samples = local_worker.sample()
        episode_buffer.add(samples)

    batch_size = config["batch_size"]
    dreamer_train_iters = config["dreamer_train_iters"]
    act_repeat = config["action_repeat"]

    rollouts = ParallelRollouts(workers)
    rollouts = rollouts.for_each(
        DreamerIteration(local_worker, episode_buffer, dreamer_train_iters,
                         batch_size, act_repeat))
    return rollouts
def test_store_to_replay_local(ray_start_regular_shared):
    buf = MultiAgentReplayBuffer(
        num_shards=1,
        learning_starts=200,
        capacity=1000,
        replay_batch_size=100,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=0.0001)
    assert buf.replay() is None

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

    next(b)
    assert buf.replay() is None  # learning hasn't started yet
    next(b)
    assert buf.replay().count == 100

    replay_op = Replay(local_buffer=buf)
    assert next(replay_op).count == 100
def test_store_to_replay_actor(ray_start_regular_shared):
    actor = ReplayActor.remote(
        num_shards=1,
        learning_starts=200,
        buffer_size=1000,
        replay_batch_size=100,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=0.0001)
    assert ray.get(actor.replay.remote()) is None

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(actors=[actor]))

    next(b)
    assert ray.get(actor.replay.remote()) is None  # learning hasn't started
    next(b)
    assert ray.get(actor.replay.remote()).count == 100

    replay_op = Replay(actors=[actor])
    assert next(replay_op).count == 100
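# For context: a hypothetical sketch of the make_workers() helper used by the
# tests above (the real fixture lives in RLlib's test suite and may differ).
# It builds a WorkerSet over CartPole with rollout_fragment_length=100, which
# is why the tests expect batches and counters of exactly 100 steps.
# Assumes an RLlib ~1.x WorkerSet API (policy_class/trainer_config kwargs).
import gym
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.trainer import COMMON_CONFIG
from ray.rllib.evaluation.worker_set import WorkerSet


def make_workers(n: int) -> WorkerSet:
    config = dict(COMMON_CONFIG, rollout_fragment_length=100, num_workers=n)
    return WorkerSet(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_class=PPOTFPolicy,
        trainer_config=config,
        num_workers=n)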